Loading (and installing, if necessary) the required packages

# Install any package that is missing, then attach each one.
packages_to_use <- c("ROCR", "tidyverse", "caret", "glmnet", "rpart", "rpart.plot",
                     "vip", "pdp", "randomForest", "gbm", "GGally", "cowplot",
                     "dplyr", "DALEX", "DALEXtra", "lime", "localModel", "gridExtra")

for (pkg in packages_to_use) {
  if (!pkg %in% rownames(installed.packages())) {
    # message() writes to stderr and honors the line break,
    # unlike print(paste(..., "\n")) which shows a literal "\n"
    message(pkg, " not installed; installing now")
    install.packages(pkg)
  }
  # library() stops with an error if attachment fails;
  # require() would only return FALSE silently
  library(pkg, character.only = TRUE)
}
## Loading required package: ROCR
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: caret
## 
## Loading required package: lattice
## 
## 
## Attaching package: 'caret'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## 
## Loading required package: glmnet
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loaded glmnet 4.1-8
## 
## Loading required package: rpart
## 
## Loading required package: rpart.plot
## 
## Loading required package: vip
## 
## 
## Attaching package: 'vip'
## 
## 
## The following object is masked from 'package:utils':
## 
##     vi
## 
## 
## Loading required package: pdp
## 
## 
## Attaching package: 'pdp'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     partial
## 
## 
## Loading required package: randomForest
## 
## randomForest 4.7-1.2
## 
## Type rfNews() to see new features/changes/bug fixes.
## 
## 
## Attaching package: 'randomForest'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## 
## Loading required package: gbm
## 
## Loaded gbm 2.2.2
## 
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
## 
## Loading required package: GGally
## 
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Loading required package: cowplot
## 
## 
## Attaching package: 'cowplot'
## 
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
## 
## 
## Loading required package: DALEX
## 
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies
## 
## 
## Attaching package: 'DALEX'
## 
## 
## The following object is masked from 'package:vip':
## 
##     titanic
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     explain
## 
## 
## Loading required package: DALEXtra
## 
## Loading required package: lime
## 
## 
## Attaching package: 'lime'
## 
## 
## The following object is masked from 'package:DALEX':
## 
##     explain
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     explain
## 
## 
## Loading required package: localModel
## 
## Loading required package: gridExtra
## 
## 
## Attaching package: 'gridExtra'
## 
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Part 1: Data Exploration

Reading in the data

# Read the Student Alcohol Consumption data
sact.dat <- read.csv("sac.csv")

# Derived columns: combined parental education (Medu + Fedu, 0-8)
# and the mean of the three period grades G1-G3
sact.dat <- sact.dat %>%
  mutate(
    paredu = Medu + Fedu,
    grades = (G1 + G2 + G3) / 3
  )

head(sact.dat)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob     reason
## 1     GP   F  18       U     GT3       A    4    4  at_home  teacher     course
## 2     GP   F  17       U     GT3       T    1    1  at_home    other     course
## 3     GP   F  15       U     LE3       T    1    1  at_home    other      other
## 4     GP   F  15       U     GT3       T    4    2   health services       home
## 5     GP   F  16       U     GT3       T    3    3    other    other       home
## 6     GP   M  16       U     LE3       T    4    3 services    other reputation
##   guardian traveltime studytime failures schoolsup famsup paid activities
## 1   mother          2         2        0       yes     no   no         no
## 2   father          1         2        0        no    yes   no         no
## 3   mother          1         2        0       yes     no   no         no
## 4   mother          1         3        0        no    yes   no        yes
## 5   father          1         2        0        no    yes   no         no
## 6   mother          1         2        0        no    yes   no        yes
##   nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1     yes    yes       no       no      4        3     4    1    1      3
## 2      no    yes      yes       no      5        3     3    1    1      3
## 3     yes    yes      yes       no      4        3     2    2    3      3
## 4     yes    yes      yes      yes      3        2     2    1    1      5
## 5     yes    yes       no       no      4        3     2    1    2      5
## 6     yes    yes      yes       no      5        4     2    1    2      5
##   absences G1 G2 G3 paredu    grades
## 1        4  0 11 11      8  7.333333
## 2        2  9 11 11      2 10.333333
## 3        6 12 13 12      2 12.333333
## 4        0 14 14 14      6 14.000000
## 5        0 11 13 13      6 12.333333
## 6        6 12 12 13      7 12.333333
# Summary statistics (mean, sd, skew, kurtosis, ...) for every column;
# psych::describe coerces factor/character columns to integer codes
# (those rows are marked with *).
psych::describe(sact.dat)
##             vars   n  mean   sd median trimmed  mad   min   max range  skew
## school*        1 649  1.35 0.48   1.00    1.31 0.00  1.00  2.00  1.00  0.64
## sex*           2 649  1.41 0.49   1.00    1.39 0.00  1.00  2.00  1.00  0.37
## age            3 649 16.74 1.22  17.00   16.70 1.48 15.00 22.00  7.00  0.41
## address*       4 649  1.70 0.46   2.00    1.74 0.00  1.00  2.00  1.00 -0.85
## famsize*       5 649  1.30 0.46   1.00    1.25 0.00  1.00  2.00  1.00  0.89
## Pstatus*       6 649  1.88 0.33   2.00    1.97 0.00  1.00  2.00  1.00 -2.29
## Medu           7 649  2.51 1.13   2.00    2.53 1.48  0.00  4.00  4.00 -0.03
## Fedu           8 649  2.31 1.10   2.00    2.27 1.48  0.00  4.00  4.00  0.21
## Mjob*          9 649  2.94 1.25   3.00    2.93 1.48  1.00  5.00  4.00 -0.19
## Fjob*         10 649  3.22 0.86   3.00    3.29 0.00  1.00  5.00  4.00 -0.53
## reason*       11 649  2.11 1.19   2.00    2.02 1.48  1.00  4.00  3.00  0.56
## guardian*     12 649  1.83 0.52   2.00    1.83 0.00  1.00  3.00  2.00 -0.20
## traveltime    13 649  1.57 0.75   1.00    1.43 0.00  1.00  4.00  3.00  1.24
## studytime     14 649  1.93 0.83   2.00    1.85 1.48  1.00  4.00  3.00  0.70
## failures      15 649  0.22 0.59   0.00    0.07 0.00  0.00  3.00  3.00  3.08
## schoolsup*    16 649  1.10 0.31   1.00    1.01 0.00  1.00  2.00  1.00  2.57
## famsup*       17 649  1.61 0.49   2.00    1.64 0.00  1.00  2.00  1.00 -0.46
## paid*         18 649  1.06 0.24   1.00    1.00 0.00  1.00  2.00  1.00  3.69
## activities*   19 649  1.49 0.50   1.00    1.48 0.00  1.00  2.00  1.00  0.06
## nursery*      20 649  1.80 0.40   2.00    1.88 0.00  1.00  2.00  1.00 -1.52
## higher*       21 649  1.89 0.31   2.00    1.99 0.00  1.00  2.00  1.00 -2.55
## internet*     22 649  1.77 0.42   2.00    1.83 0.00  1.00  2.00  1.00 -1.26
## romantic*     23 649  1.37 0.48   1.00    1.34 0.00  1.00  2.00  1.00  0.55
## famrel        24 649  3.93 0.96   4.00    4.05 1.48  1.00  5.00  4.00 -1.10
## freetime      25 649  3.18 1.05   3.00    3.19 1.48  1.00  5.00  4.00 -0.18
## goout         26 649  3.18 1.18   3.00    3.20 1.48  1.00  5.00  4.00 -0.01
## Dalc          27 649  1.50 0.92   1.00    1.28 0.00  1.00  5.00  4.00  2.13
## Walc          28 649  2.28 1.28   2.00    2.14 1.48  1.00  5.00  4.00  0.63
## health        29 649  3.54 1.45   4.00    3.67 1.48  1.00  5.00  4.00 -0.50
## absences      30 649  3.66 4.64   2.00    2.80 2.97  0.00 32.00 32.00  2.01
## G1            31 649 11.40 2.75  11.00   11.38 2.97  0.00 19.00 19.00  0.00
## G2            32 649 11.57 2.91  11.00   11.56 2.97  0.00 19.00 19.00 -0.36
## G3            33 649 11.91 3.23  12.00   12.04 2.97  0.00 19.00 19.00 -0.91
## paredu        34 649  4.82 2.03   5.00    4.79 2.97  0.00  8.00  8.00  0.12
## grades        35 649 11.63 2.83  11.67   11.64 2.47  1.33 18.67 17.33 -0.23
##             kurtosis   se
## school*        -1.60 0.02
## sex*           -1.87 0.02
## age             0.05 0.05
## address*       -1.28 0.02
## famsize*       -1.21 0.02
## Pstatus*        3.23 0.01
## Medu           -1.27 0.04
## Fedu           -1.12 0.04
## Mjob*          -0.83 0.05
## Fjob*           1.17 0.03
## reason*        -1.25 0.05
## guardian*       0.17 0.02
## traveltime      1.08 0.03
## studytime       0.02 0.03
## failures        9.70 0.02
## schoolsup*      4.64 0.01
## famsup*        -1.79 0.02
## paid*          11.66 0.01
## activities*    -2.00 0.02
## nursery*        0.31 0.02
## higher*         4.50 0.01
## internet*      -0.41 0.02
## romantic*      -1.71 0.02
## famrel          1.32 0.04
## freetime       -0.41 0.04
## goout          -0.87 0.05
## Dalc            4.28 0.04
## Walc           -0.78 0.05
## health         -1.13 0.06
## absences        5.70 0.18
## G1              0.02 0.11
## G2              1.63 0.11
## G3              2.66 0.13
## paredu         -1.14 0.08
## grades          0.58 0.11
# Inspect the spread (variance) of each column to help choose predictors.
# var() on character columns coerces to NA with a warning, so restrict
# the computation to the numeric columns first.
sapply(sact.dat[sapply(sact.dat, is.numeric)], var)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
##     school        sex        age    address    famsize    Pstatus       Medu 
##         NA         NA  1.4838593         NA         NA         NA  1.2872082 
##       Fedu       Mjob       Fjob     reason   guardian traveltime  studytime 
##  1.2098480         NA         NA         NA         NA  0.5604919  0.6880861 
##   failures  schoolsup     famsup       paid activities    nursery     higher 
##  0.3519279         NA         NA         NA         NA         NA         NA 
##   internet   romantic     famrel   freetime      goout       Dalc       Walc 
##         NA         NA  0.9133948  1.1047956  1.3824260  0.8553187  1.6496319 
##     health   absences         G1         G2         G3     paredu     grades 
##  2.0916652 21.5366423  7.5364806  8.4892903 10.4371398  4.1130657  8.0279305
# Keep the columns used in the rest of the analysis, with grades as the outcome
sacf.dat <- sact.dat %>%
  select(Mjob, Fjob, guardian, famsup, activities, romantic, freetime,
         health, goout, paredu, sex, age, reason, grades)

print(head(sacf.dat))
##       Mjob     Fjob guardian famsup activities romantic freetime health goout
## 1  at_home  teacher   mother     no         no       no        3      3     4
## 2  at_home    other   father    yes         no       no        3      3     3
## 3  at_home    other   mother     no         no       no        3      3     2
## 4   health services   mother    yes        yes      yes        2      5     2
## 5    other    other   father    yes         no       no        3      5     2
## 6 services    other   mother    yes        yes       no        4      5     2
##   paredu sex age     reason    grades
## 1      8   F  18     course  7.333333
## 2      2   F  17     course 10.333333
## 3      2   F  15      other 12.333333
## 4      6   F  15       home 14.000000
## 5      6   F  16       home 12.333333
## 6      7   M  16 reputation 12.333333
# str() prints its summary as a side effect and returns NULL invisibly,
# so wrapping it in print() only added a spurious "NULL" line.
str(sacf.dat)
## 'data.frame':    649 obs. of  14 variables:
##  $ Mjob      : chr  "at_home" "at_home" "at_home" "health" ...
##  $ Fjob      : chr  "teacher" "other" "other" "services" ...
##  $ guardian  : chr  "mother" "father" "mother" "mother" ...
##  $ famsup    : chr  "no" "yes" "no" "yes" ...
##  $ activities: chr  "no" "no" "no" "yes" ...
##  $ romantic  : chr  "no" "no" "no" "yes" ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ paredu    : int  8 2 2 6 6 7 4 8 5 7 ...
##  $ sex       : chr  "F" "F" "F" "F" ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ reason    : chr  "course" "course" "other" "home" ...
##  $ grades    : num  7.33 10.33 12.33 14 12.33 ...
## NULL

##–Details about the dataset: Student Alcohol Consumption–##

#school - student’s school (binary: ‘GP’ - Gabriel Pereira or ‘MS’ - Mousinho da Silveira)

#sex - student’s sex (binary: ‘F’ - female or ‘M’ - male)

#age - student’s age (numeric: from 15 to 22)

#address - student’s home address type (binary: ‘U’ - urban or ‘R’ - rural)

#famsize - family size (binary: ‘LE3’ - less or equal to 3 or ‘GT3’ - greater than 3)

#Pstatus - parent’s cohabitation status (binary: ‘T’ - living together or ‘A’ - apart)

#Medu - mother’s education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)

#Fedu - father’s education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)

#Mjob - mother’s job (nominal: ‘teacher’, ‘health’ care related, civil ‘services’ (e.g. administrative or police), ‘at_home’ or ‘other’)

#Fjob - father’s job (nominal: ‘teacher’, ‘health’ care related, civil ‘services’ (e.g. administrative or police), ‘at_home’ or ‘other’)

#reason - reason to choose this school (nominal: close to ‘home’, school ‘reputation’, ‘course’ preference or ‘other’)

#guardian - student’s guardian (nominal: ‘mother’, ‘father’ or ‘other’)

#traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)

#studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)

#failures - number of past class failures (numeric: n if 1<=n<3, else 4)

#schoolsup - extra educational support (binary: yes or no)

#famsup - family educational support (binary: yes or no)

#paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)

#activities - extra-curricular activities (binary: yes or no)

#nursery - attended nursery school (binary: yes or no)

#higher - wants to take higher education (binary: yes or no)

#internet - Internet access at home (binary: yes or no)

#romantic - with a romantic relationship (binary: yes or no)

#famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)

#freetime - free time after school (numeric: from 1 - very low to 5 - very high)

#goout - going out with friends (numeric: from 1 - very low to 5 - very high)

#Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

#Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)

#health - current health status (numeric: from 1 - very bad to 5 - very good)

#absences - number of school absences (numeric: from 0 to 93)

#G1 - first period grade (numeric: from 0 to 20)

#G2 - second period grade (numeric: from 0 to 20)

#G3 - final grade (numeric: from 0 to 20, output target)

##added variables:

#paredu - parent’s education out of 8 (higher = higher combined education levels of both parents)

#grades - mean of the three period grades G1, G2 and G3 (numeric: from 0 to 20)

Factoring variables in the dataset

# Convert the categorical columns to factors with explicit level orders.
# When labels = is omitted, factor() reuses the levels as the labels, so
# the repeated labels arguments were redundant everywhere except sex,
# which relabels F/M as Female/Male.
sacf.dat <- sacf.dat %>%
  mutate(
    sex      = factor(sex, levels = c("F", "M"), labels = c("Female", "Male")),
    Mjob     = factor(Mjob, levels = c("teacher", "health", "services", "at_home", "other")),
    Fjob     = factor(Fjob, levels = c("teacher", "health", "services", "at_home", "other")),
    guardian = factor(guardian, levels = c("mother", "father", "other")),
    reason   = factor(reason, levels = c("home", "reputation", "course", "other")),
    # binary yes/no indicators share the same level order
    across(c(famsup, activities, romantic), ~ factor(.x, levels = c("no", "yes"))),
    # 1-5 Likert-style scales; integer codes become labels "1".."5"
    across(c(freetime, goout, health), ~ factor(.x, levels = 1:5))
  )

head(sacf.dat)
##       Mjob     Fjob guardian famsup activities romantic freetime health goout
## 1  at_home  teacher   mother     no         no       no        3      3     4
## 2  at_home    other   father    yes         no       no        3      3     3
## 3  at_home    other   mother     no         no       no        3      3     2
## 4   health services   mother    yes        yes      yes        2      5     2
## 5    other    other   father    yes         no       no        3      5     2
## 6 services    other   mother    yes        yes       no        4      5     2
##   paredu    sex age     reason    grades
## 1      8 Female  18     course  7.333333
## 2      2 Female  17     course 10.333333
## 3      2 Female  15      other 12.333333
## 4      6 Female  15       home 14.000000
## 5      6 Female  16       home 12.333333
## 6      7   Male  16 reputation 12.333333
# Confirm the conversions: categorical columns are now factors,
# paredu/age stay integer and grades stays numeric
str(sacf.dat)
## 'data.frame':    649 obs. of  14 variables:
##  $ Mjob      : Factor w/ 5 levels "teacher","health",..: 4 4 4 2 5 3 5 5 3 5 ...
##  $ Fjob      : Factor w/ 5 levels "teacher","health",..: 1 5 5 3 5 5 5 1 5 5 ...
##  $ guardian  : Factor w/ 3 levels "mother","father",..: 1 2 1 1 2 1 1 1 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ freetime  : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 2 3 4 4 1 2 5 ...
##  $ health    : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 5 5 3 1 1 5 ...
##  $ goout     : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 2 2 2 4 4 2 1 ...
##  $ paredu    : int  8 2 2 6 6 7 4 8 5 7 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ reason    : Factor w/ 4 levels "home","reputation",..: 3 3 4 1 1 2 1 1 1 1 ...
##  $ grades    : num  7.33 10.33 12.33 14 12.33 ...
# Per-column summaries: level counts for factors,
# five-number summaries plus mean for the numeric columns
summary(sacf.dat)
##        Mjob           Fjob       guardian   famsup    activities romantic 
##  teacher : 72   teacher : 36   mother:455   no :251   no :334    no :410  
##  health  : 48   health  : 23   father:153   yes:398   yes:315    yes:239  
##  services:136   services:181   other : 41                                 
##  at_home :135   at_home : 42                                              
##  other   :258   other   :367                                              
##                                                                           
##  freetime health  goout       paredu          sex           age       
##  1: 45    1: 90   1: 48   Min.   :0.000   Female:383   Min.   :15.00  
##  2:107    2: 78   2:145   1st Qu.:3.000   Male  :266   1st Qu.:16.00  
##  3:251    3:124   3:205   Median :5.000                Median :17.00  
##  4:178    4:108   4:141   Mean   :4.821                Mean   :16.74  
##  5: 68    5:249   5:110   3rd Qu.:6.000                3rd Qu.:18.00  
##                           Max.   :8.000                Max.   :22.00  
##         reason        grades      
##  home      :149   Min.   : 1.333  
##  reputation:143   1st Qu.:10.000  
##  course    :285   Median :11.667  
##  other     : 72   Mean   :11.625  
##                   3rd Qu.:13.333  
##                   Max.   :18.667
# Re-check distribution shape after factoring; describe() coerces the
# factors to their integer level codes (rows marked with *)
psych::describe(sacf.dat) #skew is high for guardian, Mjob, Fjob. 
##             vars   n  mean   sd median trimmed  mad   min   max range  skew
## Mjob*          1 649  3.71 1.35   4.00    3.88 1.48  1.00  5.00  4.00 -0.73
## Fjob*          2 649  4.05 1.22   5.00    4.24 0.00  1.00  5.00  4.00 -0.95
## guardian*      3 649  1.36 0.60   1.00    1.25 0.00  1.00  3.00  2.00  1.43
## famsup*        4 649  1.61 0.49   2.00    1.64 0.00  1.00  2.00  1.00 -0.46
## activities*    5 649  1.49 0.50   1.00    1.48 0.00  1.00  2.00  1.00  0.06
## romantic*      6 649  1.37 0.48   1.00    1.34 0.00  1.00  2.00  1.00  0.55
## freetime*      7 649  3.18 1.05   3.00    3.19 1.48  1.00  5.00  4.00 -0.18
## health*        8 649  3.54 1.45   4.00    3.67 1.48  1.00  5.00  4.00 -0.50
## goout*         9 649  3.18 1.18   3.00    3.20 1.48  1.00  5.00  4.00 -0.01
## paredu        10 649  4.82 2.03   5.00    4.79 2.97  0.00  8.00  8.00  0.12
## sex*          11 649  1.41 0.49   1.00    1.39 0.00  1.00  2.00  1.00  0.37
## age           12 649 16.74 1.22  17.00   16.70 1.48 15.00 22.00  7.00  0.41
## reason*       13 649  2.43 0.96   3.00    2.41 1.48  1.00  4.00  3.00 -0.20
## grades        14 649 11.63 2.83  11.67   11.64 2.47  1.33 18.67 17.33 -0.23
##             kurtosis   se
## Mjob*          -0.66 0.05
## Fjob*          -0.18 0.05
## guardian*       0.95 0.02
## famsup*        -1.79 0.02
## activities*    -2.00 0.02
## romantic*      -1.71 0.02
## freetime*      -0.41 0.04
## health*        -1.13 0.06
## goout*         -0.87 0.05
## paredu         -1.14 0.08
## sex*           -1.87 0.02
## age             0.05 0.05
## reason*        -1.04 0.04
## grades          0.58 0.11
# Count of missing cells across the whole data frame (0 = complete data)
sum(is.na(sacf.dat)) #there are no missing values in the dataframe
## [1] 0
#ggpairs to look at overall patterns
# alpha must be supplied through the mapping argument; as a bare extra
# argument ggpairs drops it with a warning ("Extra arguments ... are
# being ignored").
plotgg1 <- GGally::ggpairs(sacf.dat,
                           mapping = ggplot2::aes(alpha = 0.2),
                           progress = FALSE)
## Warning in warn_if_args_exist(list(...)): Extra arguments: "alpha" are being
## ignored.  If these are meant to be aesthetics, submit them using the 'mapping'
## variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
# NOTE(review): suppressWarnings() hides all warnings while rendering the
# pair plot; the `stat_bin()` bins = 30 notices are messages, not
# warnings, so they still print — consider suppressMessages() as well.
suppressWarnings(print(plotgg1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of age and of combined parental education.
# Mapping a continuous variable to fill/colour in its own histogram was
# dropped by stat_bin (it only produced warnings and had no visual
# effect), so the plots use the default single colour.
age.count <- ggplot(sacf.dat, aes(x = age)) +
  geom_histogram(alpha = 0.5, bins = 10) +
  theme(legend.position = "right")

paredu.count <- ggplot(sacf.dat, aes(x = paredu)) +
  geom_histogram(alpha = 0.5, bins = 10) +
  theme(legend.position = "right")

plot_grid(age.count, paredu.count, labels = c("age", "paredu"), ncol = 2, nrow = 1)
## Warning: The following aesthetics were dropped during statistical transformation: fill
## and colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## and colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Scatterplots

# Keep only the numeric columns (paredu, age, grades).
# select_if() is superseded in dplyr; select(where(...)) is the
# current idiom.
num.dat <- select(sacf.dat, where(is.numeric))
head(num.dat)
##   paredu age    grades
## 1      8  18  7.333333
## 2      2  17 10.333333
## 3      2  15 12.333333
## 4      6  15 14.000000
## 5      6  16 12.333333
## 6      7  16 12.333333
# Helper: jittered scatterplot of a numeric predictor against grades
# with a dashed least-squares trend line (the two original plots were
# identical except for the y variable and point colour).
scatter_vs_grades <- function(dat, yvar, pt_col) {
  ggplot(dat, aes(x = grades, y = {{ yvar }})) +
    geom_point(position = position_jitter(0.33), col = pt_col, alpha = 0.3) +
    # method as a string is the documented form (was the bare function lm)
    geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
    theme(legend.position = "right")
}

age <- scatter_vs_grades(sacf.dat, age, "red")
paredu <- scatter_vs_grades(sacf.dat, paredu, "green")

plot_grid(age, paredu, labels = c("age", "paredu"), ncol = 2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#the spread seems to be quite even, with a few outliers

Histogram

# Keep only the factor columns.
# select_if() is superseded in dplyr; select(where(...)) is the
# current idiom.
cat.dat <- select(sacf.dat, where(is.factor))
head(cat.dat)
##       Mjob     Fjob guardian famsup activities romantic freetime health goout
## 1  at_home  teacher   mother     no         no       no        3      3     4
## 2  at_home    other   father    yes         no       no        3      3     3
## 3  at_home    other   mother     no         no       no        3      3     2
## 4   health services   mother    yes        yes      yes        2      5     2
## 5    other    other   father    yes         no       no        3      5     2
## 6 services    other   mother    yes        yes       no        4      5     2
##      sex     reason
## 1 Female     course
## 2 Female     course
## 3 Female      other
## 4 Female       home
## 5 Female       home
## 6   Male reputation
#Histograms for each variable against grades
# Helper: histogram of grades, filled/coloured by one predictor. Numeric
# predictors (age, paredu) are converted to factors first so ggplot keeps a
# discrete fill/colour scale instead of dropping the aesthetics during the
# stat transformation (the source of the "aesthetics were dropped" warnings).
grade.hist <- function(var) {
  v <- sacf.dat[[var]]
  if (is.numeric(v)) v <- factor(v)
  ggplot(sacf.dat, aes(x = grades, fill = v, color = v)) +
    geom_histogram(alpha = 0.5, bins = 30) +
    labs(fill = var, color = var) +
    theme(legend.position = "right")
}

# Same panel order as before: the 11 factors, then age and paredu.
hist.vars <- c("sex", "Mjob", "Fjob", "guardian", "famsup", "activities",
               "romantic", "freetime", "health", "goout", "reason", "age",
               "paredu")

plot_grid(plotlist = lapply(hist.vars, grade.hist),
          labels = hist.vars,
          ncol = 3, nrow = 5)
## Warning: The following aesthetics were dropped during statistical transformation: fill
## and colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## and colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Bar graphs

# Bar charts of category counts for each categorical predictor.
# Note: `bins` is not a geom_bar() parameter — it was silently ignored with a
# warning on every plot — so it is dropped here.
count.bar <- function(var) {
  ggplot(sacf.dat, aes(x = .data[[var]], fill = .data[[var]], color = .data[[var]])) +
    geom_bar(alpha = 0.5) +
    labs(x = var, fill = var, color = var) +
    theme(legend.position = "right")
}

bar.vars <- c("sex", "Mjob", "Fjob", "guardian", "famsup", "activities",
              "romantic", "freetime", "health", "goout", "reason")

plot_grid(plotlist = lapply(bar.vars, count.bar),
          labels = bar.vars,
          ncol = 3, nrow = 4)

#again we see how Mjob, Fjob, and guardian are skewed.
#health does look a little skewed, but the psych::description showed that it is fine, so we will trust that.

Boxplots

#boxplots 
# Boxplots of grades by each categorical predictor.
# Note: `bins` is not a geom_boxplot() parameter — it was silently ignored
# with a warning on every plot — so it is dropped here.
grade.box <- function(var) {
  ggplot(sacf.dat, aes(y = grades, x = .data[[var]], fill = .data[[var]], color = .data[[var]])) +
    geom_boxplot(alpha = 0.5) +
    labs(x = var, fill = var, color = var) +
    theme(legend.position = "right")
}

box.vars <- c("sex", "Mjob", "Fjob", "guardian", "famsup", "activities",
              "romantic", "freetime", "health", "goout", "reason")

plot_grid(plotlist = lapply(box.vars, grade.box),
          labels = box.vars,
          ncol = 3, nrow = 4)

# Dichotomize parental education (paredu ranges 0-8): > 4 = "High", else "Low".
# (The stray trailing comma inside case_when() has been removed; the two
# conditions are exhaustive for non-missing paredu, and the data has no NAs.)
sacf.dat <- sacf.dat %>% 
  mutate(paredu.cat = case_when(paredu > 4 ~ "High",
                                paredu <= 4 ~ "Low"))

head(sacf.dat)
##       Mjob     Fjob guardian famsup activities romantic freetime health goout
## 1  at_home  teacher   mother     no         no       no        3      3     4
## 2  at_home    other   father    yes         no       no        3      3     3
## 3  at_home    other   mother     no         no       no        3      3     2
## 4   health services   mother    yes        yes      yes        2      5     2
## 5    other    other   father    yes         no       no        3      5     2
## 6 services    other   mother    yes        yes       no        4      5     2
##   paredu    sex age     reason    grades paredu.cat
## 1      8 Female  18     course  7.333333       High
## 2      2 Female  17     course 10.333333        Low
## 3      2 Female  15      other 12.333333        Low
## 4      6 Female  15       home 14.000000       High
## 5      6 Female  16       home 12.333333       High
## 6      7   Male  16 reputation 12.333333       High
# Jittered scatter of grades vs paredu, panelled by the new High/Low split,
# to eyeball whether the dichotomization separates the grade distributions.
ggplot(sacf.dat, aes(x = grades, y = paredu, fill = paredu.cat)) +
  geom_point(size = 2, shape = 16, position = position_jitter(0.5)) +
  facet_wrap(~ paredu.cat)

Check for Near-Zero Variance

# caret::nearZeroVar flags predictors with almost no variation.
# freqCut = 80/20: flag when the most common level is >4x as frequent as the
# second most common AND (uniqueCut) fewer than 10% of values are unique.
# saveMetrics = TRUE returns the per-variable diagnostics table instead of
# just the indices of offending columns.
nearZeroVar(
  cat.dat,
  freqCut = 80/20,
  uniqueCut = 10,
  saveMetrics = TRUE,
  names = FALSE,
  foreach = FALSE,
  allowParallel = TRUE
)
##            freqRatio percentUnique zeroVar   nzv
## Mjob        1.897059     0.7704160   FALSE FALSE
## Fjob        2.027624     0.7704160   FALSE FALSE
## guardian    2.973856     0.4622496   FALSE FALSE
## famsup      1.585657     0.3081664   FALSE FALSE
## activities  1.060317     0.3081664   FALSE FALSE
## romantic    1.715481     0.3081664   FALSE FALSE
## freetime    1.410112     0.7704160   FALSE FALSE
## health      2.008065     0.7704160   FALSE FALSE
## goout       1.413793     0.7704160   FALSE FALSE
## sex         1.439850     0.3081664   FALSE FALSE
## reason      1.912752     0.6163328   FALSE FALSE
#None of the variables failed the near-zero-variance test. (This check matters most once categorical variables are dummy-coded, since rare levels become sparse indicator columns.)

Selecting final variables to use

#we remove Mjob, Fjob, and guardian due to skewness issue. 

# Final analysis set: keep the ten predictors plus the outcome, dropping
# Mjob, Fjob, and guardian (skewed) as well as the helper column paredu.cat.
sac.dat <- sacf.dat %>%
  select(sex, age, paredu, famsup, activities, romantic,
         freetime, health, goout, reason, grades)

head(sac.dat)
##      sex age paredu famsup activities romantic freetime health goout     reason
## 1 Female  18      8     no         no       no        3      3     4     course
## 2 Female  17      2    yes         no       no        3      3     3     course
## 3 Female  15      2     no         no       no        3      3     2      other
## 4 Female  15      6    yes        yes      yes        2      5     2       home
## 5 Female  16      6    yes         no       no        3      5     2       home
## 6   Male  16      7    yes        yes       no        4      5     2 reputation
##      grades
## 1  7.333333
## 2 10.333333
## 3 12.333333
## 4 14.000000
## 5 12.333333
## 6 12.333333
str(sac.dat)
## 'data.frame':    649 obs. of  11 variables:
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ paredu    : int  8 2 2 6 6 7 4 8 5 7 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ freetime  : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 2 3 4 4 1 2 5 ...
##  $ health    : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 5 5 3 1 1 5 ...
##  $ goout     : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 2 2 2 4 4 2 1 ...
##  $ reason    : Factor w/ 4 levels "home","reputation",..: 3 3 4 1 1 2 1 1 1 1 ...
##  $ grades    : num  7.33 10.33 12.33 14 12.33 ...
summary(sac.dat)
##      sex           age            paredu      famsup    activities romantic 
##  Female:383   Min.   :15.00   Min.   :0.000   no :251   no :334    no :410  
##  Male  :266   1st Qu.:16.00   1st Qu.:3.000   yes:398   yes:315    yes:239  
##               Median :17.00   Median :5.000                                 
##               Mean   :16.74   Mean   :4.821                                 
##               3rd Qu.:18.00   3rd Qu.:6.000                                 
##               Max.   :22.00   Max.   :8.000                                 
##  freetime health  goout          reason        grades      
##  1: 45    1: 90   1: 48   home      :149   Min.   : 1.333  
##  2:107    2: 78   2:145   reputation:143   1st Qu.:10.000  
##  3:251    3:124   3:205   course    :285   Median :11.667  
##  4:178    4:108   4:141   other     : 72   Mean   :11.625  
##  5: 68    5:249   5:110                    3rd Qu.:13.333  
##                                            Max.   :18.667
psych::describe(sac.dat)
##             vars   n  mean   sd median trimmed  mad   min   max range  skew
## sex*           1 649  1.41 0.49   1.00    1.39 0.00  1.00  2.00  1.00  0.37
## age            2 649 16.74 1.22  17.00   16.70 1.48 15.00 22.00  7.00  0.41
## paredu         3 649  4.82 2.03   5.00    4.79 2.97  0.00  8.00  8.00  0.12
## famsup*        4 649  1.61 0.49   2.00    1.64 0.00  1.00  2.00  1.00 -0.46
## activities*    5 649  1.49 0.50   1.00    1.48 0.00  1.00  2.00  1.00  0.06
## romantic*      6 649  1.37 0.48   1.00    1.34 0.00  1.00  2.00  1.00  0.55
## freetime*      7 649  3.18 1.05   3.00    3.19 1.48  1.00  5.00  4.00 -0.18
## health*        8 649  3.54 1.45   4.00    3.67 1.48  1.00  5.00  4.00 -0.50
## goout*         9 649  3.18 1.18   3.00    3.20 1.48  1.00  5.00  4.00 -0.01
## reason*       10 649  2.43 0.96   3.00    2.41 1.48  1.00  4.00  3.00 -0.20
## grades        11 649 11.63 2.83  11.67   11.64 2.47  1.33 18.67 17.33 -0.23
##             kurtosis   se
## sex*           -1.87 0.02
## age             0.05 0.05
## paredu         -1.14 0.08
## famsup*        -1.79 0.02
## activities*    -2.00 0.02
## romantic*      -1.71 0.02
## freetime*      -0.41 0.04
## health*        -1.13 0.06
## goout*         -0.87 0.05
## reason*        -1.04 0.04
## grades          0.58 0.11
sum(is.na(sac.dat))
## [1] 0
#we kept variables with low skewness but also kept some variables with intermediate skew levels because we are interested in those variables (think they might help predict grades). 

Checking correlations between variables

# Generalized variance inflation factors from a full linear model.
# GVIF^(1/(2*Df)) near 1 indicates little multicollinearity among predictors.
car::vif(lm(grades ~ ., 
            data = sac.dat,
            ))
##                GVIF Df GVIF^(1/(2*Df))
## sex        1.129465  1        1.062763
## age        1.101425  1        1.049488
## paredu     1.095749  1        1.046780
## famsup     1.078598  1        1.038556
## activities 1.102624  1        1.050059
## romantic   1.080818  1        1.039624
## freetime   1.433898  4        1.046080
## health     1.142574  4        1.016800
## goout      1.397374  4        1.042711
## reason     1.141863  3        1.022356
#VIF measures multicollinearity among the predictors (not their correlation with grades): a value of 1 means a predictor is uncorrelated with the other predictors, and larger values mean stronger collinearity. All values here are close to 1, so multicollinearity is not a concern.

Part 2: Modeling

Create Train and Test Set

set.seed(1234) 

# 70/30 train/test split, stratified on the outcome (grades) so both sets
# cover the full grade distribution.
train.index <- caret::createDataPartition(sac.dat$grades, p = .7, list = FALSE)

train.dat <- sac.dat[ train.index, ]
test.dat <- sac.dat[ - train.index, ]

dim(train.dat)
## [1] 457  11
dim(test.dat)
## [1] 192  11
# Shared seed so every model sees the same resamples.
my.seed <- 111

set.seed(my.seed)
# Resampling scheme used by all models: 10-fold CV repeated 5 times
# (50 resamples total).
tr.Control <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 5
                           )

Multiple Linear Regression

set.seed(my.seed)
# Baseline model: multiple linear regression on centered/scaled predictors,
# selected by cross-validated RMSE.
lm.fit <- train(grades ~ ., 
             method = "lm",
             trControl = tr.Control,
             data = train.dat,
             metric = "RMSE",
             preProcess = c("center", "scale")
             )

lm.fit$results
##   intercept     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 2.576139 0.1642073 2.015726 0.2196639  0.0861487 0.1693247

Regularized Regression

#random search
set.seed(my.seed)
# Elastic net via glmnet; tuneLength = 20 lets caret choose the alpha/lambda
# candidates automatically (alpha mixes ridge = 0 and lasso = 1).
elastic.fit1 <- train(grades ~ ., 
                  data = train.dat, 
                  method = 'glmnet', 
                  trControl = tr.Control,
                  verbose = FALSE,
                  tuneLength = 20,
                  metric = "RMSE",
                  preProcess = c("center", "scale")
                  )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# RMSE profile across the tuning grid.
plot(elastic.fit1)

# CV performance at the best alpha/lambda combination.
elastic.fit1$results[ rownames(elastic.fit1$bestTune), ]
##        alpha    lambda     RMSE  Rsquared     MAE    RMSESD RsquaredSD
## 31 0.1473684 0.2750149 2.560243 0.1659912 1.98834 0.2127696 0.08922157
##        MAESD
## 31 0.1535565
unique(elastic.fit1$results$alpha) 
##  [1] 0.1000000 0.1473684 0.1947368 0.2421053 0.2894737 0.3368421 0.3842105
##  [8] 0.4315789 0.4789474 0.5263158 0.5736842 0.6210526 0.6684211 0.7157895
## [15] 0.7631579 0.8105263 0.8578947 0.9052632 0.9526316 1.0000000
#alpha is close to one end of the tuning range.
unique(elastic.fit1$results$lambda)
##  [1] 0.0009186832 0.0014244328 0.0022086055 0.0034244777 0.0053097068
##  [6] 0.0082327842 0.0127650617 0.0197924297 0.0306884745 0.0475829640
## [11] 0.0737781365 0.1143941649 0.1773699578 0.2750149185 0.4264149708
## [16] 0.6611631410 1.0251438831
#grid search
set.seed(my.seed) 
# Refined grid centered on the region the random search favoured
# (small alpha, lambda roughly 0.1-0.6).
elastic.grid <- expand.grid(alpha = seq(0, 0.3, length.out = 20),
                            lambda = seq(0.1, 0.6, length.out = 20))

set.seed(my.seed)
elastic.fit2 <- train(grades ~ ., 
                      data = train.dat, 
                      method = 'glmnet', 
                      trControl = tr.Control,
                      verbose = FALSE,
                      tuneGrid = elastic.grid,
                      metric = "RMSE",
                      preProcess = c("center", "scale")
                      )

plot(elastic.fit2)

elastic.fit2$results[ rownames(elastic.fit2$bestTune), ]
##          alpha    lambda    RMSE  Rsquared      MAE    RMSESD RsquaredSD
## 130 0.09473684 0.3368421 2.56004 0.1658442 1.988752 0.2142611 0.08915918
##         MAESD
## 130 0.1543687
unique(elastic.fit2$results$alpha) 
##  [1] 0.00000000 0.01578947 0.03157895 0.04736842 0.06315789 0.07894737
##  [7] 0.09473684 0.11052632 0.12631579 0.14210526 0.15789474 0.17368421
## [13] 0.18947368 0.20526316 0.22105263 0.23684211 0.25263158 0.26842105
## [19] 0.28421053 0.30000000
unique(elastic.fit2$results$lambda)
##  [1] 0.1000000 0.1263158 0.1526316 0.1789474 0.2052632 0.2315789 0.2578947
##  [8] 0.2842105 0.3105263 0.3368421 0.3631579 0.3894737 0.4157895 0.4421053
## [15] 0.4684211 0.4947368 0.5210526 0.5473684 0.5736842 0.6000000
#alpha and lambda are not at the end of the tuning range now.


# get the upper and lower RMSE bordering the optimal RMSE
# Match on the best tune's RMSE *value* only. (The original compared the
# unique RMSEs against the entire best-tune row — every column — which only
# gave the right index because %in% silently coerced the row to a vector.)
opt.RMSE <- elastic.fit2$results[rownames(elastic.fit2$bestTune), "RMSE"]
opt.RMSE.index <- which(unique(elastic.fit2$results$RMSE) %in% opt.RMSE)

# The optimum and its immediate neighbours in the ordered unique RMSEs.
RMSE.new.range <- unique(elastic.fit2$results$RMSE)[c(opt.RMSE.index - 1,
                                                      opt.RMSE.index,
                                                      opt.RMSE.index + 1)]

# Rows of the results table whose RMSE falls in that bordering range.
elastic.fit2$results[elastic.fit2$results$RMSE %in% RMSE.new.range, ]
##          alpha    lambda     RMSE  Rsquared      MAE    RMSESD RsquaredSD
## 129 0.09473684 0.3105263 2.560071 0.1657937 1.988831 0.2144791 0.08898615
## 130 0.09473684 0.3368421 2.560040 0.1658442 1.988752 0.2142611 0.08915918
## 131 0.09473684 0.3631579 2.560179 0.1658411 1.988876 0.2140479 0.08930964
##         MAESD
## 129 0.1549954
## 130 0.1543687
## 131 0.1538091
#how much RMSE varies from one sample to the next = RMSESD/sqrt(50)
# Standard error of the CV RMSE at the best tune, over the 50 resamples
# (10 folds x 5 repeats).
best.row <- elastic.fit2$results[rownames(elastic.fit2$bestTune), ]
cv.sd <- best.row$RMSESD
cv.n <- 50
avCV.SE <- cv.sd / sqrt(cv.n)
avCV.SE
## [1] 0.03030109
#We see that the RMSE has not changed much. The variation in RMSE seems much smaller than the CV RMSE SE = 0.03. So we stop.
#RMSE should vary about 0.03 across different samples. However, the RMSE here vary less than 0.01, so we can stop fine tuning here.

Decision Tree

#random search
set.seed(my.seed)
# Single CART tree; caret tunes the complexity parameter (cp) over 20 values.
# Centering/scaling is commented out: trees are invariant to monotone
# transformations of the predictors.
tree.fit1 <- train(grades ~ .,
                   data = train.dat,
                   trControl = tr.Control,
                   method = "rpart",
                   metric = "RMSE",
                   #preProcess = c("center", "scale"),
                   tuneLength = 20
                   )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
tree.fit1$results [ row.names(tree.fit1$bestTune),]
##            cp     RMSE Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 18 0.02313252 2.703663 0.073586 2.130176 0.2343079 0.06035126 0.1599496
plot(tree.fit1)

rpart.plot(tree.fit1$finalModel)

unique(tree.fit1$results$cp)
##  [1] 0.003810606 0.004585483 0.005286322 0.005745640 0.006240667 0.006589220
##  [7] 0.006784420 0.008405313 0.008519346 0.008640897 0.008660516 0.009572490
## [13] 0.010597562 0.011084748 0.011817110 0.012459137 0.014042632 0.023132523
## [19] 0.031686707 0.070890298
#cp is quite close to the extreme end of selected range

#grid search
set.seed(my.seed)
# Refined cp grid bracketing the best value from the random search (~0.023).
tree.fit2 <- train(grades ~ .,
                   data = train.dat,
                   trControl = tr.Control,
                   method = "rpart",
                   metric = "RMSE",
                   #preProcess = c("center", "scale"),
                   tuneGrid = expand.grid(cp = seq(0.014, 0.032, length.out = 20 ) )
                   )

tree.fit2$results [ row.names(tree.fit2$bestTune),]
##            cp    RMSE   Rsquared      MAE    RMSESD RsquaredSD    MAESD
## 10 0.02252632 2.70047 0.07550309 2.128543 0.2361885 0.05958051 0.161072
plot(tree.fit2)

rpart.plot(tree.fit2$finalModel)

unique(tree.fit2$results$cp)
##  [1] 0.01400000 0.01494737 0.01589474 0.01684211 0.01778947 0.01873684
##  [7] 0.01968421 0.02063158 0.02157895 0.02252632 0.02347368 0.02442105
## [13] 0.02536842 0.02631579 0.02726316 0.02821053 0.02915789 0.03010526
## [19] 0.03105263 0.03200000
#cp is not close to the end of the tuning range, but the number of buckets is too little and we will fine tune that.


# Candidate minimum-split sizes for the rpart control.
split.vec <- c(20, 40, 50, 60, 70, 80)

# Fit one cross-validated tree per minsplit value. Results are collected in a
# preallocated list and bound once at the end — growing a data frame with
# rbind() inside a loop copies it on every iteration.
tree.result.list <- vector("list", length(split.vec))

for (i in seq_along(split.vec)) {
  set.seed(my.seed)
  tree.fit3 <- train(grades ~ .,
                     data = train.dat,
                     trControl = tr.Control, 
                     method = "rpart",
                     metric = "RMSE",
                     #preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(cp = seq(0.014, 0.032, length.out = 20 ) ),
                     control = rpart.control(minsplit = split.vec[i])
                   )

  temp.result <- tree.fit3$results
  temp.result$minsplit <- split.vec[i]

  tree.result.list[[i]] <- temp.result
}

tree.result.dat <- do.call(rbind, tree.result.list)

# RMSE vs cp, one line per minsplit value.
ggplot(tree.result.dat, aes( x = cp, y = RMSE, color = as.factor(minsplit)) ) +
  geom_point() +
  geom_line()

# Best (minimum-RMSE) rows across all minsplit values.
tree.result.dat %>% 
  filter(RMSE == min(RMSE, na.rm = TRUE))
##           cp    RMSE   Rsquared      MAE    RMSESD RsquaredSD    MAESD minsplit
## 1 0.02252632 2.70047 0.07550309 2.128543 0.2361885 0.05958051 0.161072       20
## 2 0.02252632 2.70047 0.07550309 2.128543 0.2361885 0.05958051 0.161072       40
## 3 0.02252632 2.70047 0.07550309 2.128543 0.2361885 0.05958051 0.161072       50
#we will keep minsplit = 40 and cp = 0.02252632
set.seed(my.seed)

# Final tree, refit at the chosen hyperparameters.
tree.fit4 <- train(grades ~ .,
                   data = train.dat,
                   trControl = tr.Control, 
                   method = "rpart",
                   metric = "RMSE",
                   #preProcess = c("center", "scale"),
                   tuneGrid = expand.grid(cp = 0.02252632    ),
                   control = rpart.control(minsplit = 40)
                 )

tree.fit4$results [ row.names(tree.fit4$bestTune),]
##           cp    RMSE   Rsquared      MAE    RMSESD RsquaredSD    MAESD
## 1 0.02252632 2.70047 0.07550309 2.128543 0.2361885 0.05958051 0.161072
#plot(tree.fit4)
rpart.plot(tree.fit4$finalModel)

Random Forest

set.seed(my.seed)
# Random forest with 1000 trees, tuning mtry over 1..(p-1).
# mtry = p-1 (all predictors) corresponds to bagging.
rf.fit1 <- train(grades ~ .,
                 data = train.dat,
                 method = "rf",
                 trControl = tr.Control ,
                 #preProc = c("center", "scale"),  
                 ntree = 1000,
                 metric = "RMSE",
                 tuneGrid = expand.grid(mtry = seq(1, ncol(train.dat)-1))
                 )

# node size left at the randomForest default here; tuned explicitly below
# bag.fit

rf.fit1$results
##    mtry     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1     1 2.670301 0.1251016 2.121746 0.2255992 0.07271636 0.1450470
## 2     2 2.633737 0.1138724 2.080641 0.2117456 0.06639839 0.1332613
## 3     3 2.630691 0.1115196 2.071496 0.2019564 0.06008428 0.1289168
## 4     4 2.639827 0.1081964 2.073501 0.1990731 0.05932240 0.1281654
## 5     5 2.646452 0.1068010 2.077764 0.1950594 0.05793634 0.1267241
## 6     6 2.653286 0.1052388 2.079790 0.1924161 0.05631559 0.1279332
## 7     7 2.664393 0.1018892 2.088239 0.1942837 0.05558167 0.1306198
## 8     8 2.668567 0.1018751 2.089174 0.1911831 0.05496624 0.1288483
## 9     9 2.671777 0.1018588 2.092629 0.1915110 0.05566215 0.1304676
## 10   10 2.677769 0.1003134 2.098943 0.1951272 0.05527494 0.1310200
rf.fit1$results[ rownames(rf.fit1$bestTune),]
##   mtry     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 3    3 2.630691 0.1115196 2.071496 0.2019564 0.06008428 0.1289168
plot(rf.fit1)

# Rsquared as a function of mtry for the forest above.
plot.dat <- rf.fit1$results
ggplot(plot.dat, aes(x = mtry, y = Rsquared )) +
  geom_point() +
  geom_line() +
  theme_bw()

#fine tune rf since it looks quite similar to the linear model 
# Sweep the terminal-node size; fitted models are kept in rf.models and the
# CV results in a preallocated list (growing a data frame with rbind() inside
# a loop copies it on every iteration).
node.size <- c(10, 20, 30, 40, 50, 60)
rf.models <- vector("list", length = length(node.size))
rf2.results.list <- vector("list", length = length(node.size))

for (i in seq_along(node.size)) {

  # Progress marker, since each forest takes a while to fit.
  cat("\n Iteration: ", i, " ; Node size =  ", node.size[i], "\n")

  set.seed(my.seed)

  rf.fit2 <- train(grades ~ ., 
                   data = train.dat, 
                   trControl = tr.Control, 
                   method = "rf",
                 # preProc = c("center", "scale"), 
                   ntree = 1000, 
                   tuneGrid = expand.grid(mtry = seq(1, ncol(train.dat)-1) ), 
                   nodesize = node.size[i]
                   )

  rf.models[[i]] <- rf.fit2

  results.temp <- rf.fit2$results
  results.temp$node.size <- node.size[i]

  rf2.results.list[[i]] <- results.temp
}  

rf2.results.dat <- do.call(rbind, rf2.results.list)
## 
##  Iteration:  1  ; Node size =   10 
## 
##  Iteration:  2  ; Node size =   20 
## 
##  Iteration:  3  ; Node size =   30 
## 
##  Iteration:  4  ; Node size =   40 
## 
##  Iteration:  5  ; Node size =   50 
## 
##  Iteration:  6  ; Node size =   60
# RMSE vs mtry, one line per node size.
rf2plot.dat <- rf2.results.dat
rf2plot.dat$node.size <- factor(rf2plot.dat$node.size)
ggplot(rf2plot.dat, aes(x = mtry, y = RMSE, color = node.size )) +
  geom_point() +
  geom_line() +
  theme_bw()

rf2.results.dat[rf2.results.dat$RMSE == min(rf2.results.dat$RMSE), ]
##    mtry     RMSE  Rsquared     MAE    RMSESD RsquaredSD     MAESD node.size
## 57    7 2.587995 0.1456905 2.02236 0.2097109   0.073771 0.1339882        60
#from the graph, the larger node sizes do slightly better; the minimum CV RMSE occurs at node size 60 (mtry = 7), so we select the node-size-60 model as our final random forest below.
#since there is not a big change from rf.fit1, we will stop fine tuning here.
# Final RF: the node-size-60 fit, which had the lowest CV RMSE in the sweep.
rf.fitfinal <- rf.models[[ which(node.size == 60)  ]]

# Sensitivity check: does the number of trees matter? Results are collected
# in a preallocated list and bound once (no rbind() growth inside the loop).
tree.N <- c(500, 1000, 2000, 3000)

rf.models2 <- vector("list", length = length(tree.N))
rf3.results.list <- vector("list", length = length(tree.N))

for (i in seq_along(tree.N)) {
  set.seed(30825920)
  rf.fit3 <- train(grades ~ .,
                   data = train.dat,
                   method = "rf",
                   trControl = tr.Control ,
                   preProc = c("center", "scale"),  
                   ntree = tree.N[i],
                   tuneGrid = expand.grid(mtry = seq(1, ncol(train.dat)-1)
                           )
  )

  rf.models2[[i]] <- rf.fit3

  results.temp <- rf.fit3$results
  results.temp$tree.size <- tree.N[i]

  rf3.results.list[[i]] <- results.temp
}

rf3.results.dat <- do.call(rbind, rf3.results.list)
# RMSE vs mtry, one line per forest size.
rf3plot.dat <- rf3.results.dat
rf3plot.dat$tree.size <- factor(rf3plot.dat$tree.size)
ggplot(rf3plot.dat, aes(x = mtry, y = RMSE, color = tree.size )) +
  geom_point() +
  geom_line() +
  theme_bw()

rf3.results.dat[rf3.results.dat$RMSE == min(rf3.results.dat$RMSE), ]
##    mtry    RMSE  Rsquared      MAE    RMSESD RsquaredSD    MAESD tree.size
## 12    2 2.63206 0.1155033 2.081163 0.2654382  0.0878663 0.181373      1000
#this did slightly worse than rf.fit2 (otherwise it's the same), so we will use rf.fit2 instead.

Boosting

# Gradient boosting baseline: caret expands a default tuning grid of size
# tuneLength over the gbm hyperparameters and picks the best by CV RMSE.
set.seed(my.seed)
gbm.fit1 <- train(grades ~ ., 
                  data = train.dat, 
                  method = "gbm", 
                  trControl = tr.Control, 
                  preProc = c("center", "scale"),
                  tuneLength = 20,
                  verbose = FALSE,   # silence gbm's per-iteration output
                  metric = "RMSE"
                  )
plot(gbm.fit1)

# CV metrics of the best tuning-parameter combination.
gbm.fit1$results[ rownames(gbm.fit1$bestTune),]
##   shrinkage interaction.depth n.minobsinnode n.trees     RMSE Rsquared     MAE
## 2       0.1                 1             10     100 2.568025 0.161945 1.99818
##      RMSESD RsquaredSD     MAESD
## 2 0.2098142 0.08911544 0.1500962
# Manual gbm grid: cross depth (1-4), tree counts, and shrinkage
# (0.1, 0.01, 0.001), with minimum node observations fixed at 10.
tree.size <- c(50, 100, 500, 1000,  1500, 2000)

gbm.grid <- expand.grid(interaction.depth = c(1, 2, 3, 4),
                        n.trees = tree.size, 
                        shrinkage = c(10^-(1:3)),
                        n.minobsinnode = 10
                        )

set.seed(my.seed)
gbm.fit2 <- train(grades ~ ., 
                 data = train.dat, 
                 method = "gbm", 
                 trControl = tr.Control,  
                 preProc = c("center", "scale"), 
                 tuneGrid = gbm.grid, 
                 verbose = FALSE,
                 metric = "RMSE"
                 )
plot(gbm.fit2)

# CV metrics of the best grid point (slow learning rate, shallow trees).
gbm.fit2$results[ rownames(gbm.fit2$bestTune),]
##    shrinkage interaction.depth n.minobsinnode n.trees     RMSE  Rsquared
## 28      0.01                 1             10    1000 2.563566 0.1640269
##         MAE    RMSESD RsquaredSD     MAESD
## 28 1.993294 0.2114444  0.0891012 0.1492431

Part 3: Comparing Models

# Pool the per-resample CV metrics from all five final models so they can be
# compared on identical folds (all fits used the same seed/trControl).
model.resamples <- resamples( list(Regression = lm.fit, 
               Elastic = elastic.fit2, 
               Tree = tree.fit4,
               RF = rf.fitfinal,
               Boosting = gbm.fit2))

# Distribution of MAE / RMSE / Rsquared across the 50 resamples per model.
summary(model.resamples)
## 
## Call:
## summary.resamples(object = model.resamples)
## 
## Models: Regression, Elastic, Tree, RF, Boosting 
## Number of resamples: 50 
## 
## MAE 
##                Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Regression 1.676993 1.888787 2.033141 2.015726 2.119890 2.364281    0
## Elastic    1.707368 1.869172 1.971440 1.988752 2.089666 2.350665    0
## Tree       1.887932 2.002949 2.094138 2.128543 2.219750 2.589130    0
## RF         1.792896 1.930809 2.012714 2.022360 2.114549 2.363991    0
## Boosting   1.671437 1.883625 1.990164 1.993294 2.066425 2.344192    0
## 
## RMSE 
##                Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Regression 2.138816 2.453151 2.560630 2.576139 2.704602 3.163428    0
## Elastic    2.170629 2.431644 2.537092 2.560040 2.686954 3.188372    0
## Tree       2.337741 2.524175 2.634610 2.700470 2.851354 3.305859    0
## RF         2.264037 2.461165 2.567661 2.587995 2.722219 3.191767    0
## Boosting   2.188182 2.437061 2.529624 2.563566 2.688840 3.183981    0
## 
## Rsquared 
##                    Min.    1st Qu.     Median       Mean   3rd Qu.      Max.
## Regression 1.604298e-03 0.09878338 0.15554560 0.16420725 0.2294914 0.3419734
## Elastic    1.862394e-02 0.09878126 0.15069644 0.16584422 0.2180598 0.3544555
## Tree       3.409078e-05 0.02727492 0.06552866 0.07550309 0.1122245 0.2273834
## RF         2.582424e-02 0.08248527 0.13694023 0.14569050 0.1864163 0.3210509
## Boosting   1.360126e-02 0.09399700 0.14955917 0.16402686 0.2181890 0.3773888
##            NA's
## Regression    0
## Elastic       0
## Tree          0
## RF            0
## Boosting      0
# Dot plot of mean CV RMSE (with confidence bars) per model.
dotplot(model.resamples, metric = "RMSE")

# Scatter-plot matrix: each panel compares two models' per-fold RMSE.
splom(model.resamples, metric = "RMSE")

# Pairwise model differences with Bonferroni-adjusted p-values.
summary(diff(model.resamples))
## 
## Call:
## summary.diff.resamples(object = diff(model.resamples))
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## MAE 
##            Regression Elastic   Tree      RF        Boosting 
## Regression             0.026974 -0.112817 -0.006633  0.022432
## Elastic    0.00413              -0.139792 -0.033608 -0.004542
## Tree       3.053e-06  4.916e-11            0.106184  0.135249
## RF         1.00000    0.03498   2.258e-10            0.029065
## Boosting   0.30780    1.00000   2.249e-10 0.06324            
## 
## RMSE 
##            Regression Elastic   Tree      RF        Boosting 
## Regression             0.016100 -0.124330 -0.011855  0.012574
## Elastic    0.47657              -0.140430 -0.027955 -0.003526
## Tree       6.772e-06  1.079e-09            0.112475  0.136904
## RF         1.00000    0.05636   3.946e-10            0.024429
## Boosting   1.00000    1.00000   5.221e-09 0.28004            
## 
## Rsquared 
##            Regression Elastic    Tree       RF         Boosting  
## Regression            -0.0016370  0.0887042  0.0185168  0.0001804
## Elastic    1.000000               0.0903411  0.0201537  0.0018174
## Tree       6.710e-10  2.591e-10             -0.0701874 -0.0885238
## RF         0.108280   0.013471   4.844e-10             -0.0183364
## Boosting   1.000000   1.000000   8.889e-11  0.008829
# Per-resample RMSE for each model (columns named like "Regression~RMSE").
RMSE.dat <- model.resamples$values %>% 
  select(Resample, ends_with("~RMSE"))

# Column order follows the resamples() list: Regression, Elastic, Tree, RF,
# Boosting.  Fix: the first model is a linear regression, not a logistic
# regression, so label it "Regression" (it was mislabelled "Logistic").
names(RMSE.dat) <- c("folds", "Regression", "Elastic", "Tree", "RandomForest", "GBM")

# Jittered per-fold RMSE by method.
RMSE.dat %>% 
  pivot_longer(cols = !folds,
               names_to = "Method",
               values_to = "RMSE") %>% 
  ggplot(aes(x = Method, y = RMSE, color = Method)) +
  geom_point(position = position_jitter(width = 0.1))

# Long-format RMSE with resample labels split into Fold and Rep parts.
# NOTE(review): separate() assumes labels like "Fold01.Rep1" (two
# punctuation-separated pieces) -- confirm against the trainControl settings.
RMSE.long <- RMSE.dat %>% 
  pivot_longer(cols = !folds,
               names_to = "Method",
               values_to = "RMSE") %>%  
  separate(folds, into =  c("Fold", "Rep"))


# RMSE by fold, coloured by method: faded points are individual repeats,
# pointranges are mean +/- 1 SD within each fold/method cell.
RMSE.long %>% 
   ggplot(aes(x = Fold, y = RMSE, color = Method)) +
  geom_point(position = position_dodge(width = 0.3), alpha = 0.3) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), 
               geom = "pointrange", position = position_dodge(width = 0.5)) +
  theme_bw()

# Same data sliced the other way: RMSE by method, coloured by repeat.
RMSE.long %>% 
   ggplot(aes(x = Method, y = RMSE, color = Rep)) +
  geom_point(position = position_dodge(width = 0.3), alpha = 0.3) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), 
               geom = "pointrange", position = position_dodge(width = 0.5)) +
  theme_bw()

# getting cv measure for best tune :
# Pull the cross-validated RMSE (and its SD) of each model's best tuning row.
cv.dat <- rbind(lm.fit$results[rownames(lm.fit$bestTune), c("RMSE", "RMSESD") ],
      elastic.fit2$results[rownames(elastic.fit2$bestTune), c("RMSE", "RMSESD") ],
      tree.fit4$results[rownames(tree.fit4$bestTune), c("RMSE", "RMSESD") ],
      rf.fitfinal$results[rownames(rf.fitfinal$bestTune), c("RMSE", "RMSESD")],
      gbm.fit2$results[rownames(gbm.fit2$bestTune), c("RMSE", "RMSESD")]
      )

# getting model names from resamples()
# Order matches the rbind() above (Regression, Elastic, Tree, RF, Boosting).
cv.dat$model <- model.resamples$models

#the number of repeated cv folds = 10 x 5 = 50
cv.N = 50

#SE = sd / sqrt(length(x))
# NOTE(review): this treats the 50 resamples as independent, which repeated
# CV folds are not -- the SE (and the 95% CI below) is approximate.

#table:
cv.dat %>% 
  mutate(SE = RMSESD / sqrt(cv.N))
##         RMSE    RMSESD      model         SE
## 1   2.576139 0.2196639 Regression 0.03106517
## 130 2.560040 0.2142611    Elastic 0.03030109
## 11  2.700470 0.2361885       Tree 0.03340210
## 7   2.587995 0.2097109         RF 0.02965759
## 28  2.563566 0.2114444   Boosting 0.02990275
# plot: 
cv.dat %>% 
  mutate(SE = RMSESD / sqrt(cv.N)) %>% 

ggplot(aes(x = model, y = RMSE, color = model)) +
  # 95%CI for CV average RMSE
  geom_pointrange(aes(ymin = RMSE - 1.96*SE, ymax = RMSE + 1.96*SE) )

Check Variable Importance

# Variable-importance plot for each final model, arranged side by side so the
# top predictors can be compared across methods.
p1 <- vip(lm.fit) + ggtitle("Regression")
p2 <- vip(elastic.fit2) + ggtitle("Elastic")
p3 <- vip(tree.fit4) + ggtitle("Tree")
p4 <- vip(rf.fitfinal) + ggtitle("RF")
p5 <- vip(gbm.fit2) + ggtitle("Boosting")

grid.arrange(p1, p2, p3, p4, p5, ncol = 5)

#combined top 3 from all:
  ##paredu
  ##sex
  #goout
  ##reason
  ##age

Part 4: Model Performance

#Getting RMSE and Rsquared for each model on test.dat
# Held-out performance: predict each model on the test set and score with
# caret's RMSE() / R2() helpers.  The *.pred vectors are reused later for
# residual diagnostics and bootstrapping.
#linear regression
lm.pred <- predict(lm.fit, newdata = test.dat, type = "raw")
lm.RMSE.pred <- RMSE(lm.pred, test.dat$grades)
lm.R2.pred <- R2(lm.pred, test.dat$grades)

#elastic net
elastic.pred <- predict(elastic.fit2, newdata = test.dat, type = "raw")
elastic.RMSE.pred <- RMSE(elastic.pred, test.dat$grades)
elastic.R2.pred <- R2(elastic.pred, test.dat$grades)

#decision tree
tree.pred <- predict(tree.fit4, newdata = test.dat, type = "raw")
tree.RMSE.pred <- RMSE(tree.pred, test.dat$grades)
tree.R2.pred <- R2(tree.pred, test.dat$grades)

#random forest
rf.pred <- predict(rf.fitfinal, newdata = test.dat, type = "raw")
rf.RMSE.pred <- RMSE(rf.pred, test.dat$grades)
rf.R2.pred <- R2(rf.pred, test.dat$grades)

#boosting
gbm.pred <- predict(gbm.fit2, newdata = test.dat, type = "raw")
gbm.RMSE.pred <- RMSE(gbm.pred, test.dat$grades)
gbm.R2.pred <- R2(gbm.pred, test.dat$grades)



# Train-vs-test comparison table.  Train metrics come from each model's
# best-tune CV row; test metrics from the held-out predictions above.
# NOTE(review): the rf row is taken from rf2.results.dat's minimum-RMSE row
# rather than rf.fitfinal$results[bestTune] -- equivalent only if rf.fitfinal
# is that minimum-RMSE model; confirm.
test.results<- data.frame(models=c("lm","elastic","tree","rf","gbm"),
                          train.RMSE=c(lm.fit$results$RMSE,
                                       elastic.fit2$results[ rownames(elastic.fit2$bestTune), ]$RMSE,
                                       tree.fit4$results [ row.names(tree.fit4$bestTune),]$RMSE,
                                       rf2.results.dat[rf2.results.dat$RMSE == min(rf2.results.dat$RMSE), ]$RMSE,
                                       gbm.fit2$results[ rownames(gbm.fit2$bestTune),]$RMSE),
                          train.R2=c(lm.fit$results$Rsquared,
                                     elastic.fit2$results[ rownames(elastic.fit2$bestTune), ]$Rsquared,
                                     tree.fit4$results [ row.names(tree.fit4$bestTune),]$Rsquared,
                                     rf2.results.dat[rf2.results.dat$RMSE == min(rf2.results.dat$RMSE), ]$Rsquared,
                                     gbm.fit2$results[ rownames(gbm.fit2$bestTune),]$Rsquared
),
                          test.RMSE=c(lm.RMSE.pred,elastic.RMSE.pred,tree.RMSE.pred,rf.RMSE.pred,gbm.RMSE.pred),
                         test.R2=c(lm.R2.pred,elastic.R2.pred,tree.R2.pred,rf.R2.pred,gbm.R2.pred))
print(test.results)
##    models train.RMSE   train.R2 test.RMSE    test.R2
## 1      lm   2.576139 0.16420725  2.782950 0.12181139
## 2 elastic   2.560040 0.16584422  2.758887 0.12428297
## 3    tree   2.700470 0.07550309  2.831194 0.08049928
## 4      rf   2.587995 0.14569050  2.758322 0.13174238
## 5     gbm   2.563566 0.16402686  2.757155 0.12735992
#minimum RMSE for train set
test.results[which.min(test.results$train.RMSE),]
##    models train.RMSE  train.R2 test.RMSE  test.R2
## 2 elastic    2.56004 0.1658442  2.758887 0.124283
#minimum RMSE for test set
test.results[which.min(test.results$test.RMSE),]
##   models train.RMSE  train.R2 test.RMSE   test.R2
## 5    gbm   2.563566 0.1640269  2.757155 0.1273599
#based on the results, elastic had the lowest RMSE and highest Rsquared on the
#train data.  On the test data, gbm had the lowest RMSE, while rf had the
#highest Rsquared (0.1317 vs gbm's 0.1274).

Bootstrapping

# Bootstrap the test-set RMSE of each final model: resample test rows with
# replacement, score all five models on the same bootstrap sample.
boot.iter <- 1000
set.seed(my.seed)

# Preallocate one numeric column per model (numeric() instead of the original
# logical vector(length = ...), avoiding a coercion on first assignment).
boot.dat <- data.frame(LM      = numeric(boot.iter),
                       Elastic = numeric(boot.iter),
                       Tree    = numeric(boot.iter),
                       RF      = numeric(boot.iter),
                       GBM     = numeric(boot.iter))

for (i in seq_len(boot.iter)) {

  # Draw a bootstrap sample of the test set.
  boot.index <- sample(nrow(test.dat), replace = TRUE)
  boot.sample <- test.dat[boot.index, ]

  # Predictions from every model on the same bootstrap sample.
  pred.dat <- data.frame(y.lm      = predict(lm.fit, newdata = boot.sample),
                         y.elastic = predict(elastic.fit2, newdata = boot.sample),
                         y.tree    = predict(tree.fit4, newdata = boot.sample),
                         y.rf      = predict(rf.fitfinal, newdata = boot.sample),
                         y.gbm     = predict(gbm.fit2, newdata = boot.sample))

  # Squared errors -> per-model MSE -> RMSE for this replicate.
  sq.err.dat <- (boot.sample$grades - pred.dat)^2
  boot.dat[i, ] <- sqrt(colMeans(sq.err.dat))
}

# Peek at the first bootstrap RMSE replicates.
head(boot.dat)
##         LM  Elastic     Tree       RF      GBM
## 1 2.665443 2.606571 2.611300 2.574718 2.602244
## 2 2.848095 2.782658 2.879957 2.739207 2.793967
## 3 2.798956 2.758910 2.728206 2.739180 2.697285
## 4 2.704563 2.656495 2.707461 2.598371 2.652230
## 5 2.452856 2.426535 2.568996 2.464149 2.463650
## 6 2.626322 2.587371 2.629527 2.550545 2.591210
# Reshape to long format once, then reuse it for the summary and the plot.
boot.long <- boot.dat %>% 
  pivot_longer(cols = everything(),
               names_to = "Model",
               values_to = "boot.RMSE")

# Mean bootstrap RMSE per model.
mean.dat <- boot.long %>% 
  group_by(Model) %>% 
  summarize(mean = mean(boot.RMSE))

mean.dat
## # A tibble: 5 × 2
##   Model    mean
##   <chr>   <dbl>
## 1 Elastic  2.74
## 2 GBM      2.74
## 3 LM       2.76
## 4 RF       2.74
## 5 Tree     2.82
# Bootstrap RMSE distribution per model; dashed line marks its mean.
boot.long %>% 
  ggplot(aes(x = boot.RMSE, fill = Model)) +
  geom_histogram(col = "grey") +
  geom_vline(data = mean.dat, aes(xintercept = mean), col = "black", lty = 2 ) +
  facet_wrap( ~ Model, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# parametric bootstrap 95%CI: assumes normal distribution:
# Each model.type group has exactly one row here, so summarize() simply
# carries RMSE through while lower/upper are computed from the bootstrap SD.
par.ci.dat <- data.frame(RMSE =apply(boot.dat, 2, mean, na.rm = TRUE),
                       SD = apply(boot.dat, 2, sd, na.rm = TRUE),
                error.type = "Test-bootstrap",
                model.type = c("LM","Elastic", "Tree","RF","GBM")
                      ) %>%
  group_by(model.type) %>%
  summarize( RMSE = RMSE, 
             lower = RMSE - 1.96*SD, 
             upper = RMSE + 1.96*SD,
            error.type = "Test.boot.param" )


# empirical 95%CI: percentile bootstrap CI
# (distribution-free alternative to the normal-approximation CI above)
plyr::ldply(boot.dat, quantile, c(.025, .975))
##       .id     2.5%    97.5%
## 1      LM 2.422336 3.105772
## 2 Elastic 2.379332 3.096791
## 3    Tree 2.473934 3.170690
## 4      RF 2.378803 3.109788
## 5     GBM 2.385917 3.098516
# Pointrange plot of the parametric bootstrap CIs, one per model.
par.ci.dat %>% 
  ggplot(aes(x = model.type, y = RMSE, color = model.type)) +
  geom_pointrange(aes(ymin = lower, 
                      ymax = upper),
                  position = position_dodge(width = 0.2))

# Same CI construction, but centred on the point-estimate test RMSE instead
# of the bootstrap mean (the SD still comes from the bootstrap distribution).
# NOTE(review): this relies on test.results rows (lm, elastic, tree, rf, gbm)
# aligning positionally with boot.dat columns (LM, Elastic, Tree, RF, GBM) --
# they do here, but the pairing is by order, not by name.
test.ci.dat <- data.frame(RMSE = test.results$test.RMSE,
                       SD = apply(boot.dat, 2, sd, na.rm = TRUE),
                error.type = "Test-bootstrap",
                model.type = c("LM","Elastic", "Tree","RF","GBM")
                      ) %>%
  group_by(model.type) %>%
  summarize( RMSE = RMSE, 
             lower = RMSE - 1.96*SD, 
             upper = RMSE + 1.96*SD,
            error.type = "Test.boot.param" )

# Pointrange plot of the point-estimate-centred CIs.
test.ci.dat %>% 
  ggplot(aes(x = model.type, y = RMSE, color = model.type)) +
  geom_pointrange(aes(ymin = lower, 
                      ymax = upper),
                  position = position_dodge(width = 0.2))

# CIs centred on the bootstrap mean RMSE:
print(par.ci.dat)
## # A tibble: 5 × 5
##   model.type  RMSE lower upper error.type     
##   <chr>      <dbl> <dbl> <dbl> <chr>          
## 1 Elastic     2.74  2.38  3.10 Test.boot.param
## 2 GBM         2.74  2.38  3.10 Test.boot.param
## 3 LM          2.76  2.43  3.10 Test.boot.param
## 4 RF          2.74  2.38  3.11 Test.boot.param
## 5 Tree        2.82  2.46  3.17 Test.boot.param
# CIs centred on the observed test-set RMSE:
print(test.ci.dat)
## # A tibble: 5 × 5
##   model.type  RMSE lower upper error.type     
##   <chr>      <dbl> <dbl> <dbl> <chr>          
## 1 Elastic     2.76  2.40  3.12 Test.boot.param
## 2 GBM         2.76  2.40  3.12 Test.boot.param
## 3 LM          2.78  2.44  3.12 Test.boot.param
## 4 RF          2.76  2.39  3.12 Test.boot.param
## 5 Tree        2.83  2.48  3.18 Test.boot.param

Partial-Dependence Plots

#combined top 3 from all:
  ##paredu
  ##sex
  #goout
  ##reason
  ##age

#first we try paredu, age, sex
# PDP of grades over (paredu, age, sex) for each final model.  The five
# copy-pasted partial()/plotPartial() pairs are collapsed into one loop;
# assign() preserves the original object names (pdp.lm1 ... gbm.plot1)
# because later chunks reference them directly (e.g. pdp.gbm1).
pdp.fits1 <- list(lm = lm.fit, elastic = elastic.fit2, tree = tree.fit4,
                  rf = rf.fitfinal, gbm = gbm.fit2)
pdp.titles1 <- c(lm = "Linear Regression", elastic = "Regularised Regression",
                 tree = "Decision Tree", rf = "Random Forest",
                 gbm = "Gradient Boosting")

for (nm in names(pdp.fits1)) {
  # e.g. pdp.lm1 <- partial(lm.fit, ...)
  assign(paste0("pdp.", nm, "1"),
         partial(pdp.fits1[[nm]], pred.var = c("paredu", "age", "sex"),
                 plot.engine = "ggplot",
                 train = train.dat,
                 chull = FALSE))
  # e.g. lm.plot1 <- plotPartial(pdp.lm1, ...)
  assign(paste0(nm, ".plot1"),
         plotPartial(get(paste0("pdp.", nm, "1")),
                     levelplot = FALSE, zlab = "grades", drape = TRUE,
                     colorkey = TRUE,
                     main = pdp.titles1[[nm]]))
}

grid.arrange(lm.plot1, elastic.plot1, tree.plot1, rf.plot1, gbm.plot1,
             ncol = 2, nrow = 3)

#now we try paredu, age, reason
# Same PDP construction as set 1 but over (paredu, age, reason); one loop
# replaces the five copy-pasted partial()/plotPartial() pairs, with assign()
# keeping the original object names (pdp.lm2 ... gbm.plot2, used later).
pdp.fits2 <- list(lm = lm.fit, elastic = elastic.fit2, tree = tree.fit4,
                  rf = rf.fitfinal, gbm = gbm.fit2)
pdp.titles2 <- c(lm = "Linear Regression", elastic = "Regularised Regression",
                 tree = "Decision Tree", rf = "Random Forest",
                 gbm = "Gradient Boosting")

for (nm in names(pdp.fits2)) {
  assign(paste0("pdp.", nm, "2"),
         partial(pdp.fits2[[nm]], pred.var = c("paredu", "age", "reason"),
                 plot.engine = "ggplot",
                 train = train.dat,
                 chull = FALSE))
  assign(paste0(nm, ".plot2"),
         plotPartial(get(paste0("pdp.", nm, "2")),
                     levelplot = FALSE, zlab = "grades", drape = TRUE,
                     colorkey = TRUE,
                     main = pdp.titles2[[nm]]))
}

grid.arrange(lm.plot2, elastic.plot2, tree.plot2, rf.plot2, gbm.plot2,
             ncol = 2, nrow = 3)

#there are hints of interaction across reason in decision trees
#lastly we try paredu, age, goout
# Same PDP construction again, over (paredu, age, goout); loop + assign()
# replace the five copy-pasted pairs while keeping the original names.
pdp.fits3 <- list(lm = lm.fit, elastic = elastic.fit2, tree = tree.fit4,
                  rf = rf.fitfinal, gbm = gbm.fit2)
pdp.titles3 <- c(lm = "Linear Regression", elastic = "Regularised Regression",
                 tree = "Decision Tree", rf = "Random Forest",
                 gbm = "Gradient Boosting")

for (nm in names(pdp.fits3)) {
  assign(paste0("pdp.", nm, "3"),
         partial(pdp.fits3[[nm]], pred.var = c("paredu", "age", "goout"),
                 plot.engine = "ggplot",
                 train = train.dat,
                 chull = FALSE))
  assign(paste0(nm, ".plot3"),
         plotPartial(get(paste0("pdp.", nm, "3")),
                     levelplot = FALSE, zlab = "grades", drape = TRUE,
                     colorkey = TRUE,
                     main = pdp.titles3[[nm]]))
}

grid.arrange(lm.plot3, elastic.plot3, tree.plot3, rf.plot3, gbm.plot3,
             ncol = 2, nrow = 3)

# Flattened view of the tree PDP: one yhat-vs-paredu line per age value,
# panelled by reason.
ggplot(pdp.tree2, aes(x = paredu, y = yhat, colour = age)) +
  facet_wrap(~ reason) +
  geom_line(aes(group = factor(age)))

# Same flattened view for the boosting PDP, panelled by sex.
ggplot(pdp.gbm1, aes(x = paredu, y = yhat, colour = age)) +
  facet_wrap(~ sex) +
  geom_line(aes(group = factor(age)))

Additive Effects

# Additive (polynomial) effects: refit the linear model with successive
# polynomial terms in paredu, then in age.  The six near-identical train()
# calls are collapsed into a loop over named formulas; assign() keeps the
# original object names lm.add1..lm.add6 for downstream use.
add.formulas <- list(
  lm.add1 = grades ~ . + I(paredu^2),
  lm.add2 = grades ~ . + I(paredu^2) + I(paredu^3),
  lm.add3 = grades ~ . + I(paredu^2) + I(paredu^3) + I(paredu^4),
  lm.add4 = grades ~ . + I(age^2),
  lm.add5 = grades ~ . + I(age^2) + I(age^3),
  lm.add6 = grades ~ . + I(age^2) + I(age^3) + I(age^4)
)

for (nm in names(add.formulas)) {
  # Same seed per fit so every candidate model sees identical CV folds.
  set.seed(my.seed)
  assign(nm, train(add.formulas[[nm]],
                   method = "lm",
                   trControl = tr.Control,
                   data = train.dat,
                   metric = "RMSE",
                   preProcess = c("center", "scale")))
}

# Row 1 = baseline lm.fit, rows 2-7 = lm.add1..lm.add6.
lm.addresults <- rbind(lm.fit$results, lm.add1$results, lm.add2$results, lm.add3$results, lm.add4$results, lm.add5$results, lm.add6$results)
print(lm.addresults)
##   intercept     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 2.576139 0.1642073 2.015726 0.2196639 0.08614870 0.1693247
## 2      TRUE 2.578394 0.1632450 2.015453 0.2195351 0.08587161 0.1702570
## 3      TRUE 2.580669 0.1619890 2.017806 0.2196297 0.08588693 0.1706715
## 4      TRUE 2.584801 0.1598574 2.022239 0.2189800 0.08527003 0.1700173
## 5      TRUE 2.569060 0.1696336 2.009235 0.2213509 0.08806187 0.1689818
## 6      TRUE 2.576036 0.1662082 2.017741 0.2200569 0.08735137 0.1697163
## 7      TRUE 2.656088 0.1442942 2.049506 0.2957550 0.09242126 0.1792705
lm.addresults[which.min(lm.addresults$RMSE),]
##   intercept    RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 5      TRUE 2.56906 0.1696336 2.009235 0.2213509 0.08806187 0.1689818
#we see that lm.add4, with the additive effect of age (age^2), reduced RMSE
#by a little and increased Rsquared by a little as well. Overall, not a lot
#of effect.

Interaction effects

#no interactions were shown in PDPs but we will try anyway.
# Interaction effects: twelve candidate formulas crossing each interaction
# (age with reason / sex / paredu / goout) with increasing polynomial terms
# in age.  The twelve near-identical train() calls are collapsed into a loop;
# assign() keeps the original object names lm.int1..lm.int12.
int.formulas <- list(
  lm.int1  = grades ~ . + age:reason,
  lm.int2  = grades ~ . + I(age^2) + age:reason,
  lm.int3  = grades ~ . + I(age^2) + I(age^3) + age:reason,
  lm.int4  = grades ~ . + age:sex,
  lm.int5  = grades ~ . + I(age^2) + age:sex,
  lm.int6  = grades ~ . + I(age^2) + I(age^3) + age:sex,
  lm.int7  = grades ~ . + age:paredu,
  lm.int8  = grades ~ . + I(age^2) + age:paredu,
  lm.int9  = grades ~ . + I(age^2) + I(age^3) + age:paredu,
  lm.int10 = grades ~ . + age:goout,
  lm.int11 = grades ~ . + I(age^2) + age:goout,
  lm.int12 = grades ~ . + I(age^2) + I(age^3) + age:goout
)

for (nm in names(int.formulas)) {
  # Same seed per fit so every candidate model sees identical CV folds.
  set.seed(my.seed)
  assign(nm, train(int.formulas[[nm]],
                   method = "lm",
                   trControl = tr.Control,
                   data = train.dat,
                   metric = "RMSE",
                   preProcess = c("center", "scale")))
}

# Row 1 = baseline lm.fit, rows 2-13 = lm.int1..lm.int12.
lm.intresults <- rbind(lm.fit$results, lm.int1$results, lm.int2$results, lm.int3$results, lm.int4$results, lm.int5$results, lm.int6$results, lm.int7$results, lm.int8$results, lm.int9$results, lm.int10$results, lm.int11$results, lm.int12$results )
print(lm.intresults)
##    intercept     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1       TRUE 2.576139 0.1642073 2.015726 0.2196639 0.08614870 0.1693247
## 2       TRUE 2.578179 0.1642603 2.025673 0.2102657 0.08328293 0.1573871
## 3       TRUE 2.574482 0.1685998 2.015499 0.2140838 0.08557726 0.1625302
## 4       TRUE 2.586206 0.1620217 2.025086 0.2117536 0.08447715 0.1632040
## 5       TRUE 2.576998 0.1645093 2.023278 0.2229984 0.08769018 0.1743435
## 6       TRUE 2.571421 0.1690507 2.014922 0.2244917 0.08889643 0.1739584
## 7       TRUE 2.577460 0.1662290 2.022782 0.2222795 0.08856331 0.1741426
## 8       TRUE 2.577878 0.1637266 2.017023 0.2206085 0.08699967 0.1676820
## 9       TRUE 2.574063 0.1672966 2.012224 0.2216906 0.08824220 0.1681112
## 10      TRUE 2.582554 0.1630163 2.021803 0.2197465 0.08695132 0.1692731
## 11      TRUE 2.586052 0.1605505 2.023655 0.2165140 0.08405205 0.1644683
## 12      TRUE 2.580254 0.1652532 2.017811 0.2186990 0.08456856 0.1608782
## 13      TRUE 2.593660 0.1583121 2.030608 0.2169911 0.08325163 0.1635244
lm.intresults[which.min(lm.intresults$RMSE),]
##   intercept     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 6      TRUE 2.571421 0.1690507 2.014922 0.2244917 0.08889643 0.1739584
#we see that lm.int5 with additive age and interaction between age and sex helped decrease RMSE and increase Rsquared by a little, but still not so much. This is also higher than the RMSE of the additive model from before.  
#since the improvement is minimal, we will not use it for further analyses. 
# Collect the observed test outcomes, each model's predictions, and the
# corresponding residuals (observed - predicted) in one data frame.
res.dat <- data.frame(test.y = test.dat$grades,
                      lm.pred,
                      elastic.pred,
                      tree.pred,
                      rf.pred,
                      gbm.pred) %>%
  mutate(resid.lm = test.y - lm.pred,
         resid.elastic = test.y - elastic.pred,
         resid.tree = test.y - tree.pred,
         resid.rf = test.y - rf.pred,
         resid.gbm = test.y - gbm.pred)
#transform to long format 
#first get a residuals only dataset, then merge with a predictions only dataset:

# 1. get residuals only data (single pipeline):
residuals.long <- res.dat %>% 
  select(resid.lm, resid.elastic, resid.tree, resid.rf, resid.gbm, test.y) %>% 
  pivot_longer(cols = !test.y, 
               names_to = "method", 
               values_to = "residuals") %>% 
  # clean up the labels in "method"
  mutate(method = case_when(method == "resid.lm" ~ "Linear",
                            method == "resid.elastic" ~ "Elastic",
                            method == "resid.tree" ~ "Tree",
                            method == "resid.rf" ~ "RF",
                            method == "resid.gbm" ~ "GBM"),
         # row ID used to join with the predictions data; row_number()
         # replaces the fragile 1:nrow(residuals.long) self-reference to the
         # object being reassigned.
         ID = row_number())

# 2. get predictions-only data (same shape and IDs as residuals.long):
predictions.long <- res.dat %>% 
  select(lm.pred, elastic.pred, tree.pred, rf.pred, gbm.pred, test.y) %>% 
  pivot_longer(cols = !test.y, 
               names_to = "method", 
               values_to = "predictions") %>% 
  # clean up the labels in "method"
  mutate(method = case_when(method == "lm.pred" ~ "Linear",
                            method == "elastic.pred" ~ "Elastic",
                            method == "tree.pred" ~ "Tree",
                            method == "rf.pred" ~ "RF",
                            method == "gbm.pred" ~ "GBM"),
         # row ID for the merge; row_number() replaces the fragile
         # 1:nrow(predictions.long) self-reference.
         ID = row_number())

# 3. merge both to get the diagnostics dataset:
# merge() joins on all shared columns (test.y, method, ID) and sorts the result
# by those keys, which is why the printed rows below start at the lowest test.y.
diag.dat <- merge(predictions.long, residuals.long)
head(diag.dat)
##     test.y  method  ID predictions  residuals
## 1 1.666667 Elastic 867   12.631518 -10.964852
## 2 1.666667 Elastic 877    9.590935  -7.924269
## 3 1.666667     GBM 870   13.007687 -11.341020
## 4 1.666667     GBM 880    8.566205  -6.899539
## 5 1.666667  Linear 866   12.043743 -10.377076
## 6 1.666667  Linear 876    9.493757  -7.827090
# Residuals vs. predictions, one panel per model; the dashed zero line marks
# unbiased predictions and the loess smoother highlights systematic bias.
ggplot(diag.dat, aes(x = predictions, y = residuals)) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
  geom_smooth(se = FALSE, linetype = "solid") +
  facet_wrap(~ method) +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 12.622
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1.6218
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.6447e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 2.5722

# Predicted vs. observed values per model; the dashed 45-degree line is
# perfect calibration.
ggplot(diag.dat, aes(x = test.y, y = predictions)) +
  geom_point(alpha = 0.2) +
  geom_abline(intercept = 0, slope = 1, colour = "red", linetype = "dashed") +
  geom_smooth(se = FALSE, linetype = "solid") +
  facet_wrap(~ method) +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

dv = "grades"
pred.vars <- names(sac.dat)[!names(sac.dat)%in% dv]
explainer.dat <- test.dat

# build explainer for linear regression:
lm.exp <- DALEX::explain(model = lm.fit, 
                  data = explainer.dat[ , pred.vars],
                  y = explainer.dat[ , dv], 
                  label = "Linear Regression",
                  type = "regression")
## Preparation of a new explainer is initiated
##   -> model label       :  Linear Regression 
##   -> data              :  192  rows  10  cols 
##   -> target variable   :  192  values 
##   -> predict function  :  yhat.train  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package caret , ver. 7.0.1 , task regression (  default  ) 
##   -> model_info        :  type set to  regression 
##   -> predicted values  :  numerical, min =  8.445585 , mean =  11.64295 , max =  15.1418  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -10.37708 , mean =  -0.1099685 , max =  7.464469  
##   A new explainer has been created!
# Wrap each remaining fitted caret model in a DALEX explainer over the same
# test-set predictors and outcome, so the explanation tools below can be
# compared across models on identical data. DALEX auto-detects the caret
# predict method (yhat.train) — presumably via its caret support; confirm if
# a model is swapped for a non-caret fit.
# build explainer for elastic net:
elastic.exp <- DALEX::explain(model = elastic.fit2, 
                       data = explainer.dat[ , pred.vars], 
                       y = explainer.dat[, dv], 
                       label = "Elastic",
                       type = "regression")
## Preparation of a new explainer is initiated
##   -> model label       :  Elastic 
##   -> data              :  192  rows  10  cols 
##   -> target variable   :  192  values 
##   -> predict function  :  yhat.train  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package caret , ver. 7.0.1 , task regression (  default  ) 
##   -> model_info        :  type set to  regression 
##   -> predicted values  :  numerical, min =  9.225666 , mean =  11.65599 , max =  14.47265  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -10.96485 , mean =  -0.1230071 , max =  7.016845  
##   A new explainer has been created!
# build explainer for tree:
tree.exp <- DALEX::explain(model = tree.fit4, 
                    data = explainer.dat[ , pred.vars], 
                    y = explainer.dat[, dv], 
                    label = "Tree",
                    type = "regression")
## Preparation of a new explainer is initiated
##   -> model label       :  Tree 
##   -> data              :  192  rows  10  cols 
##   -> target variable   :  192  values 
##   -> predict function  :  yhat.train  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package caret , ver. 7.0.1 , task regression (  default  ) 
##   -> model_info        :  type set to  regression 
##   -> predicted values  :  numerical, min =  9.013889 , mean =  11.70908 , max =  12.60381  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -10.93714 , mean =  -0.1760946 , max =  6  
##   A new explainer has been created!
# build explainer for RF:
rf.exp <- DALEX::explain(model = rf.fitfinal, 
                  data = explainer.dat[ , pred.vars], 
                  y = explainer.dat[, dv], 
                  label = "Random Forest",
                  type = "regression")
## Preparation of a new explainer is initiated
##   -> model label       :  Random Forest 
##   -> data              :  192  rows  10  cols 
##   -> target variable   :  192  values 
##   -> predict function  :  yhat.train  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package caret , ver. 7.0.1 , task regression (  default  ) 
##   -> model_info        :  type set to  regression 
##   -> predicted values  :  numerical, min =  9.453839 , mean =  11.67044 , max =  13.24907  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -11.15196 , mean =  -0.1374501 , max =  5.935747  
##   A new explainer has been created!
# build explainer for GBM:
gbm.exp <- DALEX::explain(model = gbm.fit2, 
                   data = explainer.dat[ , pred.vars], 
                   y= explainer.dat[, dv], 
                   label = "GBM",
                   type = "regression")
## Preparation of a new explainer is initiated
##   -> model label       :  GBM 
##   -> data              :  192  rows  10  cols 
##   -> target variable   :  192  values 
##   -> predict function  :  yhat.train  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package caret , ver. 7.0.1 , task regression (  default  ) 
##   -> model_info        :  type set to  regression 
##   -> predicted values  :  numerical, min =  8.256733 , mean =  11.62835 , max =  14.57703  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -11.34102 , mean =  -0.09536118 , max =  7.176075  
##   A new explainer has been created!
# Permutation variable importance: model_parts() shuffles each predictor
# B = 50 times; N = NULL uses all rows of the explainer data. Reseeding with
# the same seed before every call gives each model the identical permutation
# sequence, making the importance values comparable across models.
set.seed(my.seed)
vip_lm  <- model_parts(explainer = lm.exp,  B = 50, N = NULL)
set.seed(my.seed)
vip_elastic  <- model_parts(explainer = elastic.exp,  B = 50, N = NULL)
set.seed(my.seed)
vip_tree  <- model_parts(explainer = tree.exp,  B = 50, N = NULL)
set.seed(my.seed)
vip_rf  <- model_parts(explainer = rf.exp,  B = 50, N = NULL)
set.seed(my.seed)
vip_gbm <- model_parts(explainer = gbm.exp, B = 50, N = NULL)
# Plot all five importance profiles in one figure.
plot(vip_lm, vip_elastic,vip_tree, vip_rf, vip_gbm) +
  ggtitle("Mean variable-importance over 50 permutations", "") 

Create Partial-Dependence Profiles

# Partial-dependence profiles: average model prediction as one variable is
# varied. NOTE(review): model_profile() samples N = 100 observations by
# default, so these curves depend on the RNG state at each call — do not
# reorder these statements without reseeding if exact reproducibility matters.
pdp_lm.age <- model_profile(explainer = lm.exp, variables = "age")
pdp_elastic.age <- model_profile(explainer = elastic.exp, variables = "age")
pdp_tree.age <- model_profile(explainer = tree.exp, variables = "age")
pdp_rf.age <- model_profile(explainer = rf.exp, variables = "age")
pdp_gbm.age <- model_profile(explainer = gbm.exp, variables = "age")


plot(pdp_lm.age, pdp_elastic.age, pdp_tree.age, pdp_rf.age, pdp_gbm.age) +  
  ggtitle("Partial-dependence profile for age") 

# Same profiles for parental education.
pdp_lm.paredu <- model_profile(explainer = lm.exp, variables = "paredu")
pdp_elastic.paredu <- model_profile(explainer = elastic.exp, variables = "paredu")
pdp_tree.paredu <- model_profile(explainer = tree.exp, variables = "paredu")
pdp_rf.paredu <- model_profile(explainer = rf.exp, variables = "paredu")
pdp_gbm.paredu <- model_profile(explainer = gbm.exp, variables = "paredu")


plot(pdp_lm.paredu, pdp_elastic.paredu, pdp_tree.paredu, pdp_rf.paredu, pdp_gbm.paredu) +  
  ggtitle("Partial-dependence profile for paredu") 

Create Ceteris-Paribus Profiles

plot(pdp_lm.paredu, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for paredu (lm)")

plot(pdp_elastic.paredu, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for paredu (elastic)")

plot(pdp_tree.paredu, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for paredu (tree)")

plot(pdp_rf.paredu, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for paredu (rf)") 

plot(pdp_gbm.paredu, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for paredu (gbm)") 

plot(pdp_lm.age, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for age (lm)") 

plot(pdp_elastic.age, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for age (elastic)")

plot(pdp_tree.age, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for age (tree)") 

plot(pdp_rf.age, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for age (rf)") 

plot(pdp_gbm.age, geom = "profiles") + 
    ggtitle("Ceteris-paribus and partial-dependence profiles for age (gbm)")

Part 5: Model Prediction Test

# A single hypothetical student ("Nesh") to score with every fitted model.
# freetime/health/goout are given as character strings — presumably to match
# the factor coding used at training time; verify the levels against sac.dat.
nesh.dat <- data.frame(
  sex = "Male",
  age = 17,
  paredu = 7,
  famsup = "yes",
  activities = "no",
  romantic = "yes",
  freetime = "1",
  health = "4",
  goout = "1",
  reason = "home"
  )


# Point predictions from each model for the same observation.
predict(lm.fit, newdata = nesh.dat)
##        1 
## 10.19554
predict(elastic.fit2, newdata = nesh.dat)
## [1] 11.24229
predict(tree.fit4, newdata = nesh.dat)
##        1 
## 12.60381
predict(rf.fitfinal, newdata = nesh.dat)
##       1 
## 12.0188
predict(gbm.fit2, newdata = nesh.dat)
## [1] 11.74516
# Ceteris-paribus profiles for the new observation under each fitted model:
# the prediction as one variable varies while all others are held fixed.
cp.lm      <- predict_profile(explainer = lm.exp,      new_observation = nesh.dat)

cp.elastic <- predict_profile(explainer = elastic.exp, new_observation = nesh.dat)

cp.tree    <- predict_profile(explainer = tree.exp,    new_observation = nesh.dat)

cp.rf      <- predict_profile(explainer = rf.exp,      new_observation = nesh.dat)

cp.gbm     <- predict_profile(explainer = gbm.exp,     new_observation = nesh.dat)
# Print each profile object. The "Top profiles" table shows counterfactual
# rows (one variable changed at a time) with the resulting _yhat_; the "Top
# observations" table shows the unchanged observation and its prediction.
cp.lm
## Top profiles    : 
##         sex age paredu famsup activities romantic freetime health goout reason
## 1    Female  17      7    yes         no      yes        1      4     1   home
## 1.1    Male  17      7    yes         no      yes        1      4     1   home
## 11     Male  15      7    yes         no      yes        1      4     1   home
## 1.11   Male  16      7    yes         no      yes        1      4     1   home
## 1.2    Male  17      7    yes         no      yes        1      4     1   home
## 1.3    Male  18      7    yes         no      yes        1      4     1   home
##         _yhat_ _vname_ _ids_           _label_
## 1    11.386205     sex     1 Linear Regression
## 1.1  10.195544     sex     1 Linear Regression
## 11   10.624303     age     1 Linear Regression
## 1.11 10.409924     age     1 Linear Regression
## 1.2  10.195544     age     1 Linear Regression
## 1.3   9.981165     age     1 Linear Regression
## 
## 
## Top observations:
##    sex age paredu famsup activities romantic freetime health goout reason
## 1 Male  17      7    yes         no      yes        1      4     1   home
##     _yhat_           _label_ _ids_
## 1 10.19554 Linear Regression     1
cp.elastic
## Top profiles    : 
##         sex age paredu famsup activities romantic freetime health goout reason
## 1    Female  17      7    yes         no      yes        1      4     1   home
## 1.1    Male  17      7    yes         no      yes        1      4     1   home
## 11     Male  15      7    yes         no      yes        1      4     1   home
## 1.11   Male  16      7    yes         no      yes        1      4     1   home
## 1.2    Male  17      7    yes         no      yes        1      4     1   home
## 1.3    Male  18      7    yes         no      yes        1      4     1   home
##        _yhat_ _vname_ _ids_ _label_
## 1    12.14735     sex     1 Elastic
## 1.1  11.24229     sex     1 Elastic
## 11   11.58860     age     1 Elastic
## 1.11 11.41544     age     1 Elastic
## 1.2  11.24229     age     1 Elastic
## 1.3  11.06914     age     1 Elastic
## 
## 
## Top observations:
##    sex age paredu famsup activities romantic freetime health goout reason
## 1 Male  17      7    yes         no      yes        1      4     1   home
##     _yhat_ _label_ _ids_
## 1 11.24229 Elastic     1
cp.tree
## Top profiles    : 
##         sex age paredu famsup activities romantic freetime health goout reason
## 1    Female  17      7    yes         no      yes        1      4     1   home
## 1.1    Male  17      7    yes         no      yes        1      4     1   home
## 11     Male  15      7    yes         no      yes        1      4     1   home
## 1.11   Male  16      7    yes         no      yes        1      4     1   home
## 1.2    Male  17      7    yes         no      yes        1      4     1   home
## 1.3    Male  18      7    yes         no      yes        1      4     1   home
##        _yhat_ _vname_ _ids_ _label_
## 1    12.60381     sex     1    Tree
## 1.1  12.60381     sex     1    Tree
## 11   12.60381     age     1    Tree
## 1.11 12.60381     age     1    Tree
## 1.2  12.60381     age     1    Tree
## 1.3  12.60381     age     1    Tree
## 
## 
## Top observations:
##    sex age paredu famsup activities romantic freetime health goout reason
## 1 Male  17      7    yes         no      yes        1      4     1   home
##     _yhat_ _label_ _ids_
## 1 12.60381    Tree     1
cp.rf
## Top profiles    : 
##         sex age paredu famsup activities romantic freetime health goout reason
## 1    Female  17      7    yes         no      yes        1      4     1   home
## 1.1    Male  17      7    yes         no      yes        1      4     1   home
## 11     Male  15      7    yes         no      yes        1      4     1   home
## 1.11   Male  16      7    yes         no      yes        1      4     1   home
## 1.2    Male  17      7    yes         no      yes        1      4     1   home
## 1.3    Male  18      7    yes         no      yes        1      4     1   home
##        _yhat_ _vname_ _ids_       _label_
## 1    12.74237     sex     1 Random Forest
## 1.1  12.01880     sex     1 Random Forest
## 11   12.03692     age     1 Random Forest
## 1.11 12.01396     age     1 Random Forest
## 1.2  12.01880     age     1 Random Forest
## 1.3  11.93738     age     1 Random Forest
## 
## 
## Top observations:
##    sex age paredu famsup activities romantic freetime health goout reason
## 1 Male  17      7    yes         no      yes        1      4     1   home
##    _yhat_       _label_ _ids_
## 1 12.0188 Random Forest     1
cp.gbm
## Top profiles    : 
##         sex age paredu famsup activities romantic freetime health goout reason
## 1    Female  17      7    yes         no      yes        1      4     1   home
## 1.1    Male  17      7    yes         no      yes        1      4     1   home
## 11     Male  15      7    yes         no      yes        1      4     1   home
## 1.11   Male  16      7    yes         no      yes        1      4     1   home
## 1.2    Male  17      7    yes         no      yes        1      4     1   home
## 1.3    Male  18      7    yes         no      yes        1      4     1   home
##        _yhat_ _vname_ _ids_ _label_
## 1    12.63389     sex     1     GBM
## 1.1  11.74516     sex     1     GBM
## 11   11.73456     age     1     GBM
## 1.11 11.74402     age     1     GBM
## 1.2  11.74516     age     1     GBM
## 1.3  11.40870     age     1     GBM
## 
## 
## Top observations:
##    sex age paredu famsup activities romantic freetime health goout reason
## 1 Male  17      7    yes         no      yes        1      4     1   home
##     _yhat_ _label_ _ids_
## 1 11.74516     GBM     1
plot(cp.lm, variables = c("age", "paredu")) +
  ggtitle("Ceteris-paribus profile", "")

plot(cp.elastic, variables = c("age", "paredu")) +
  ggtitle("Ceteris-paribus profile", "")

plot(cp.tree, variables = c("age", "paredu")) +
  ggtitle("Ceteris-paribus profile", "")

plot(cp.rf, variables = c("age", "paredu")) +
  ggtitle("Ceteris-paribus profile", "")

plot(cp.gbm, variables = c("age", "paredu")) +
  ggtitle("Ceteris-paribus profile", "")

plot(cp.lm, cp.elastic, cp.tree, cp.rf, cp.gbm,color = "_label_",  
     variables = c("age", "paredu")) +
     ggtitle("Ceteris-paribus profiles for Nesh", "") 

# Scenario comparison: two profiles that differ ONLY in the reason for school
# choice ("home" vs "reputation"). Derive both from nesh.dat instead of
# re-typing every field, so the profiles cannot drift apart by accident.
nesh.dat.1 <- nesh.dat

nesh.dat.2 <- nesh.dat
nesh.dat.2$reason <- "reputation"




# Ceteris-paribus profiles for both observations at once (_ids_ 1 and 2),
# then plot them with a manual colour scale labelled by scenario.
cp.elastic1 <- predict_profile(explainer = elastic.exp, 
                         new_observation = rbind(nesh.dat.1,
                                                 nesh.dat.2))
plot(cp.elastic1 , color = "_ids_",
     variables = c("age", "paredu")) +
    scale_color_manual(name = "Nesh:", breaks = 1:2, 
            values = c("red", "blue"), 
            labels = c("home" , "reputation")) +
  ggtitle("Ceteris-paribus profile elastic", "")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# Same two-scenario ceteris-paribus comparison as above, for the GBM.
# The scale-replacement message is expected: plot() already adds a colour
# scale, which scale_color_manual() overrides.
cp.gbm1 <- predict_profile(explainer = gbm.exp, 
                         new_observation = rbind(nesh.dat.1,
                                                 nesh.dat.2))
plot(cp.gbm1 , color = "_ids_",
     variables = c("age", "paredu")) +
    scale_color_manual(name = "Nesh:", breaks = 1:2, 
            values = c("red", "blue"), 
            labels = c("home" , "reputation")) +
  ggtitle("Ceteris-paribus profile gbm", "")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# Break-down explanation for the elastic net: additive attributions showing
# how each variable's value moves the prediction from the intercept (mean
# prediction) to the final value for this observation.
bd.elastic <- predict_parts(explainer = elastic.exp, 
                       new_observation = nesh.dat,
                       type = "break_down")

bd.elastic
##                          contribution
## Elastic: intercept             11.656
## Elastic: paredu = 7             0.730
## Elastic: sex = Male            -0.547
## Elastic: goout = 1             -0.314
## Elastic: romantic = yes        -0.167
## Elastic: health = 4             0.156
## Elastic: famsup = yes          -0.148
## Elastic: activities = no       -0.073
## Elastic: age = 17              -0.041
## Elastic: reason = home         -0.039
## Elastic: freetime = 1           0.029
## Elastic: prediction            11.242
plot(bd.elastic)

# Break-down explanation for the GBM, for comparison with the elastic net:
# same intercept-to-prediction decomposition on the same observation.
bd.gbm<- predict_parts(explainer = gbm.exp, 
                       new_observation = nesh.dat,
                       type = "break_down")

bd.gbm
##                      contribution
## GBM: intercept             11.628
## GBM: paredu = 7             0.852
## GBM: sex = Male            -0.537
## GBM: goout = 1             -0.215
## GBM: health = 4             0.196
## GBM: age = 17               0.191
## GBM: romantic = yes        -0.142
## GBM: famsup = yes          -0.139
## GBM: reason = home         -0.104
## GBM: freetime = 1           0.073
## GBM: activities = no       -0.059
## GBM: prediction            11.745
plot(bd.gbm)